binh99 committed on
Commit
8c5ce8c
·
1 Parent(s): d037cdf
chains/__pycache__/multi_queries.cpython-39.pyc ADDED
Binary file (1.43 kB). View file
 
chains/__pycache__/openai_model.cpython-39.pyc CHANGED
Binary files a/chains/__pycache__/openai_model.cpython-39.pyc and b/chains/__pycache__/openai_model.cpython-39.pyc differ
 
chains/multi_queries.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains.llm import LLMChain
2
+ from langchain.prompts.chat import (
3
+ ChatPromptTemplate,
4
+ SystemMessagePromptTemplate,
5
+ HumanMessagePromptTemplate)
6
+ from prompts.multi_queries import system_template, human_template
7
+ from config import OPENAI_API_TYPE, OPENAI_API_VERSION, OPENAI_API_KEY, OPENAI_API_BASE, DEPLOYMENT_ID
8
+ from chains.azure_openai import CustomAzureOpenAI
9
+
10
class MultiQueries(LLMChain):
    """LLM chain preconfigured with the multi-queries prompt and Azure model.

    Both ``llm`` and ``prompt`` are supplied as class-level defaults, so the
    chain is created with a bare ``MultiQueries()`` call and queried with
    ``predict(question=...)``.
    """

    # Azure OpenAI client; every connection setting is read from ``config``.
    # temperature=0.0 is passed so sampling randomness is turned off.
    llm = CustomAzureOpenAI(
        temperature=0.0,
        deployment_name=DEPLOYMENT_ID,
        openai_api_key=OPENAI_API_KEY,
        openai_api_base=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        openai_api_version=OPENAI_API_VERSION,
    )

    # Chat prompt: the system message first, then the human message.
    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template(human_template),
    ])
22
if __name__ == "__main__":
    # Manual smoke test: run the chain on a sample question and show how the
    # answer is split into individual queries.
    queries_chain = MultiQueries()
    out = queries_chain.predict(question="Where can I request for my event's permit in Penang?")

    # The answer is expected to hold blank-line separated paragraphs. Only
    # index the second one when it exists, so a single-paragraph reply does
    # not raise IndexError.
    paragraphs = out.strip().split('\n\n')
    if len(paragraphs) > 1:
        print(paragraphs[1])
    else:
        print("Model returned fewer than two paragraphs:", repr(out))

    # Keep only the text after the last ': ' of each paragraph (drops a
    # leading "label: " prefix when the model emits one).
    print([paragraph.split(': ')[-1] for paragraph in out.split('\n\n')])
chains/openai_model.py CHANGED
@@ -104,7 +104,8 @@ class OpenAIModel:
104
  from chains.web_search import GoogleWebSearch
105
  from config import GOOGLE_API_KEY, GOOGLE_CSE_ID, CUSTOM_API_KEY, CUSTOM_CSE_ID
106
  from chains.summary import WebSummary
107
-
 
108
  status_text = "Retrieving information from the web"
109
  yield chatbot, status_text
110
  if use_websearch:
@@ -114,23 +115,37 @@ class OpenAIModel:
114
  google_api_key = CUSTOM_API_KEY
115
  google_cse_id = CUSTOM_CSE_ID
116
  search = GoogleSearchAPIWrapper(google_api_key=google_api_key, google_cse_id=google_cse_id)
117
- results = search.results(inputs,4)
 
 
 
 
 
 
 
 
118
  reference_results = []
119
  display_append = []
120
  for idx, result in enumerate(results):
121
- print(result['link'])
122
- response = requests.get(result['link'])
123
- soup = BeautifulSoup(response.content, "html.parser")
124
  try:
125
- summary = WebSummary.predict(question=inputs, doc=soup.get_text())
126
- print("Can access", result['link'])
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  except:
128
- print("Cannot access ", result['link'])
129
- summary = result['snippet']
130
- reference_results.append([summary, result['link']])
131
- display_append.append(
132
- f"<a href=\"{result['link']}\" target=\"_blank\">{idx+1}.&nbsp;{result['title']}</a>"
133
- )
134
  reference_results = add_source_numbers(reference_results)
135
  display_append = '<div class = "source-a">' + "\n".join(display_append) + '</div>'
136
 
@@ -181,8 +196,10 @@ class OpenAIModel:
181
 
182
  status_text = "Request URL: " + OPENAI_API_BASE
183
  yield chatbot, status_text
184
-
185
  # Create a function to call - this will run in a thread
 
 
 
186
  def task():
187
  # Conversation + RetrievalChain
188
  qa = CustomConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(k=5),
@@ -191,10 +208,12 @@ class OpenAIModel:
191
  combine_docs_chain_kwargs={"prompt": qa_prompt},
192
  return_source_documents=True)
193
  # query with input and chat history
194
- global response
195
  response = qa({"question": inputs, "chat_history": self.history})
 
 
196
  q.put(job_done)
197
-
 
198
  thread = Thread(target=task)
199
  thread.start()
200
  chatbot.append((inputs, ""))
@@ -211,6 +230,8 @@ class OpenAIModel:
211
  continue
212
 
213
  # add citation info to response
 
 
214
  relevant_docs = response["source_documents"]
215
  reference_results = [d.page_content for d in relevant_docs]
216
  display_append = add_details(reference_results)
 
104
  from chains.web_search import GoogleWebSearch
105
  from config import GOOGLE_API_KEY, GOOGLE_CSE_ID, CUSTOM_API_KEY, CUSTOM_CSE_ID
106
  from chains.summary import WebSummary
107
+ from chains.multi_queries import MultiQueries
108
+
109
  status_text = "Retrieving information from the web"
110
  yield chatbot, status_text
111
  if use_websearch:
 
115
  google_api_key = CUSTOM_API_KEY
116
  google_cse_id = CUSTOM_CSE_ID
117
  search = GoogleSearchAPIWrapper(google_api_key=google_api_key, google_cse_id=google_cse_id)
118
+
119
+ queries_chain = MultiQueries()
120
+ out = queries_chain.predict(question=inputs)
121
+ queries = list(map(lambda x: x.split(': ')[-1], out.split('\n\n')))
122
+ print(queries)
123
+ results = []
124
+ for query in queries:
125
+ search_rs = search.results(query, 2)
126
+ results.extend(search_rs)
127
  reference_results = []
128
  display_append = []
129
  for idx, result in enumerate(results):
 
 
 
130
  try:
131
+ head = requests.head(result['link'])
132
+ if "text/html" in head.headers['Content-Type']:
133
+ html_response = requests.get(result['link'])
134
+ soup = BeautifulSoup(html_response.content, "html.parser")
135
+ try:
136
+ web_summary = WebSummary()
137
+ summary = web_summary.predict(question=inputs, doc=soup.get_text())
138
+ print("Can access", result['link'])
139
+ except:
140
+ print("Cannot access ", result['link'])
141
+ summary = result['snippet']
142
+ reference_results.append([summary, result['link']])
143
+ display_append.append(
144
+ f"<a href=\"{result['link']}\" target=\"_blank\">{idx+1}.&nbsp;{result['title']}</a>"
145
+ )
146
  except:
147
+ continue
148
+
 
 
 
 
149
  reference_results = add_source_numbers(reference_results)
150
  display_append = '<div class = "source-a">' + "\n".join(display_append) + '</div>'
151
 
 
196
 
197
  status_text = "Request URL: " + OPENAI_API_BASE
198
  yield chatbot, status_text
 
199
  # Create a function to call - this will run in a thread
200
+
201
+ # Create a Queue object
202
+ response_queue = SimpleQueue()
203
  def task():
204
  # Conversation + RetrievalChain
205
  qa = CustomConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(k=5),
 
208
  combine_docs_chain_kwargs={"prompt": qa_prompt},
209
  return_source_documents=True)
210
  # query with input and chat history
 
211
  response = qa({"question": inputs, "chat_history": self.history})
212
+ # Put response in the queue
213
+ response_queue.put(response)
214
  q.put(job_done)
215
+
216
+
217
  thread = Thread(target=task)
218
  thread.start()
219
  chatbot.append((inputs, ""))
 
230
  continue
231
 
232
  # add citation info to response
233
+ # Get the response from the queue
234
+ response = response_queue.get()
235
  relevant_docs = response["source_documents"]
236
  reference_results = [d.page_content for d in relevant_docs]
237
  display_append = add_details(reference_results)
prompts/__pycache__/multi_queries.cpython-39.pyc ADDED
Binary file (336 Bytes). View file
 
prompts/multi_queries.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# System message that opens the chat prompt.
system_template = "You are an AI helpful assistant"

# Human message; ``{question}`` is the placeholder filled in with the user's
# original question when the prompt is formatted.
human_template = (
    "Translate the following original question into English and Malaysian.\n"
    "original question:{question}"
)
test.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pinecone
2
+ import os
3
+ import PyPDF2
4
+ import shutil
5
+ import gradio as gr
6
+
7
+ from tqdm import tqdm
8
+ from pydantic import Field
9
+ from typing import List, Optional
10
+ from langchain.load.serializable import Serializable
11
+
12
+ from langchain.vectorstores import Pinecone
13
+ from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, SAVE_DIR
14
+ from config import OPENAI_API_BASE, OPENAI_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID
15
+ from langchain.embeddings import OpenAIEmbeddings
16
+ from langchain.text_splitter import TokenTextSplitter
17
+
18
class Document(Serializable):
    """A chunk of text bundled with the metadata that describes it."""

    # The text itself.
    page_content: str

    # Free-form metadata about the text (e.g. source, relationships to other
    # documents); every instance gets its own empty dict by default.
    metadata: dict = Field(default_factory=dict)
27
+
28
# Build the path with os.path.join instead of a "documents\..." literal: the
# backslash form contains an invalid "\S" escape and only works as a path
# separator on Windows, which also breaks os.path.basename() elsewhere.
filepath = os.path.join("documents", "STANDARD_SOFTWARE LIFECYCLES.pdf")
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=30)
doc_chunks = []
documents = []

# Read the PDF and gather the extracted text of all pages into one string.
with open(filepath, "rb") as pdfFileObj:
    pdf_reader = PyPDF2.PdfReader(pdfFileObj)
    # Join once rather than growing the string with += for every page.
    pdftext = "".join(page.extract_text() for page in tqdm(pdf_reader.pages))

# Wrap the full text in a single Document, then split it into token chunks
# (500 tokens each, 30 tokens of overlap) that all carry the source path.
texts = [Document(page_content=pdftext, metadata={"source": filepath})]
texts = text_splitter.split_documents(texts)
documents.extend(texts)
print(documents[:3])
41
+ # for (idx, docs) in enumerate(documents):
42
+ # docs.page_content = f"[{idx}] " + docs.page_content
43
def add_source_numbers(lst, source_name = "Source", use_source = True):
    """Prefix every entry of ``lst`` with a 1-based ``[n]`` marker.

    Args:
        lst: Entries to number. With ``use_source`` each entry is indexed as
            ``entry[0]`` (text) and ``entry[1]`` (source); otherwise the
            entry itself is formatted as the text.
        source_name: Label written in front of the source on its own line.
        use_source: Whether to append the ``<source_name>: <source>`` line.

    Returns:
        A new list of formatted strings, one per entry, in input order.
    """
    numbered = []
    for position, entry in enumerate(lst, start=1):
        if use_source:
            numbered.append(f'[{position}]\t "{entry[0]}"\n{source_name}: {entry[1]}')
        else:
            numbered.append(f'[{position}]\t "{entry}"')
    return numbered
48
+
49
# Rewrite each chunk in place as: [n]<TAB> "<text>" / Source: <file name>,
# with n counting from 1.
for number, doc in enumerate(documents, start=1):
    cleaned_text = doc.page_content.strip("�")
    source_file = os.path.basename(doc.metadata["source"])
    doc.page_content = f'[{number}]\t "{cleaned_text}"\nSource: {source_file}'
52
+ # print(reference_results)
53
+ # print("----------------")
54
+ # print(documents[:3])
55
+
56
def add_details(lst):
    """Wrap each text in a collapsible HTML ``<details>`` element.

    The summary line is the first 25 characters of the text with newlines
    removed, followed by ``...``; the full text goes inside ``<p>``.

    Args:
        lst: Iterable of strings.

    Returns:
        A list with one HTML snippet per input string, in input order.
    """
    def render(txt):
        brief = txt[:25].replace("\n", "")
        return f"<details><summary>{brief}...</summary><p>{txt}</p></details>"

    return [render(txt) for txt in lst]
64
# Preview: render the first three chunks as collapsible HTML and print them.
reference_results = [
    chunk.page_content
    for chunk in documents[:3]
]
display_append = add_details(reference_results)
print(display_append)