Wuttipong8146 committed on
Commit
403507a
·
verified ·
1 Parent(s): d8a173d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -0
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import pdfplumber
import torch

from langchain import PromptTemplate, LLMChain
from langchain.chains import ConversationalRetrievalChain
# NOTE(review): HuggingFaceHub is normally exported from langchain.llms,
# not langchain.chat_models — verify this import path against the pinned
# langchain version.
from langchain.chat_models import HuggingFaceHub
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
13
+ # โหลดโมเดล ThaiBERT จาก Hugging Face
14
+ tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
15
+ model = AutoModelForQuestionAnswering.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
16
+
17
+ # ฟังก์ชันสำหรับอ่านเนื้อหาจาก PDF
18
+ def extract_text_from_pdf(pdf_file):
19
+ with pdfplumber.open(pdf_file) as pdf:
20
+ text = ""
21
+ for page in pdf.pages:
22
+ text += page.extract_text()
23
+ return text
24
+
25
+ # ฟังก์ชันสำหรับการตอบคำถามด้วย ThaiBERT
26
+ def answer_question(question, context):
27
+ inputs = tokenizer.encode_plus(question, context, return_tensors="pt")
28
+ answer_start_scores, answer_end_scores = model(**inputs)
29
+ answer_start = torch.argmax(answer_start_scores)
30
+ answer_end = torch.argmax(answer_end_scores) + 1
31
+ answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))
32
+ return answer
33
+
34
+ # ตั้งค่าอินเตอร์เฟสของหน้าเว็บด้วย Streamlit
35
+ st.title("ThaiBERT PDF QA System")
36
+ uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
37
+
38
+ if uploaded_file:
39
+ # อ่านเนื้อหาจาก PDF
40
+ pdf_text = extract_text_from_pdf(uploaded_file)
41
+
42
+ # สร้าง chain สำหรับถามตอบ
43
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
44
+ docs = text_splitter.create_documents([pdf_text])
45
+
46
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-xlm-r-multilingual-v1")
47
+ vector_store = Chroma.from_documents(documents=docs, embedding=embeddings)
48
+
49
+ retriever = vector_store.as_retriever()
50
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
51
+
52
+ qa_chain = ConversationalRetrievalChain(
53
+ retriever=retriever,
54
+ llm=HuggingFaceHub(repo_id="airesearch/wangchanberta-base-att-spm-uncased", model_kwargs={"temperature": 0}),
55
+ memory=memory
56
+ )
57
+
58
+ # หน้าต่างสำหรับใส่คำถาม
59
+ user_question = st.text_input("Ask a question about the PDF content")
60
+
61
+ if user_question:
62
+ response = qa_chain.run(user_question)
63
+ st.write("Answer:", response)
64
+