init commit
Browse files- README.md +54 -1
- app.py +192 -0
- requirements.txt +16 -0
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 📈
|
4 |
colorFrom: purple
|
5 |
colorTo: purple
|
@@ -10,3 +10,56 @@ pinned: false
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: RASA
|
3 |
emoji: 📈
|
4 |
colorFrom: purple
|
5 |
colorTo: purple
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
13 |
+
|
14 |
+
# RASA: Research Article Summarization App
|
15 |
+
|
16 |
+
## Description
|
17 |
+
|
18 |
+
This application summarizes an uploaded research article PDF using one of two large language models: "LaMini-Flan-T5-77M" (a fine-tuned version of google/flan-t5-small) or "LaMini-GPT-124M" (a fine-tuned version of gpt2). Both models were instruction fine-tuned on the LaMini-instruction dataset, which contains 2.58M samples.
|
19 |
+
|
20 |
+
https://huggingface.co/MBZUAI/LaMini-Flan-T5-77M
|
21 |
+
|
22 |
+
https://huggingface.co/MBZUAI/LaMini-GPT-124M
|
23 |
+
|
24 |
+
## Table of Contents
|
25 |
+
|
26 |
+
- [Installation](#installation)
|
27 |
+
- [Usage](#usage)
|
28 |
+
- [Credits](#credits)
|
29 |
+
- [License](#license)
|
30 |
+
|
31 |
+
## Installation
|
32 |
+
|
33 |
+
Create a virtual python environment. To install the required python application packages, type "pip install -r requirements.txt" in a terminal window within the virtual python environment.
|
34 |
+
|
35 |
+
## Usage
|
36 |
+
|
37 |
+
To run locally, navigate to the project folder and in a terminal window type "streamlit run app.py".
|
38 |
+
|
39 |
+
## Credits
|
40 |
+
|
41 |
+
Written by Walter Jessen
|
42 |
+
|
43 |
+
Based on https://www.youtube.com/watch?v=GIbar_kZzwk
|
44 |
+
|
45 |
+
## MIT License
|
46 |
+
|
47 |
+
Copyright (c) 2023 Walter Jessen
|
48 |
+
|
49 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
50 |
+
of this software and associated documentation files (the "Software"), to deal
|
51 |
+
in the Software without restriction, including without limitation the rights
|
52 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
53 |
+
copies of the Software, and to permit persons to whom the Software is
|
54 |
+
furnished to do so, subject to the following conditions:
|
55 |
+
|
56 |
+
The above copyright notice and this permission notice shall be included in all
|
57 |
+
copies or substantial portions of the Software.
|
58 |
+
|
59 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
60 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
61 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
62 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
63 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
64 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
65 |
+
SOFTWARE.
|
app.py
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import base64
import os

import streamlit as st
import torch
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.document_loaders.pdf import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline


# notes
# https://huggingface.co/docs/transformers/pad_truncation
11 |
+
|
12 |
+
|
13 |
+
# notes
|
14 |
+
# https://huggingface.co/docs/transformers/pad_truncation
|
15 |
+
|
16 |
+
|
17 |
+
# file loader and preprocessor
def file_preprocessing(file, skipfirst, skiplast):
    """Load a PDF, optionally drop its first/last page, and return its text.

    Args:
        file: Path to the PDF file on disk.
        skipfirst: Truthy to drop the first page (e.g. a title page).
        skiplast: Truthy to drop the last page (e.g. references).

    Returns:
        The text of the remaining pages, split into ~1000-character chunks
        and re-joined into a single string.
    """
    loader = PyMuPDFLoader(file)
    pages = loader.load_and_split()
    # Drop pages by slicing instead of `del` with `&`-combined conditions:
    # slicing is a no-op on short/empty documents, whereas the original
    # `del pages[0]; del pages[-1]` raised IndexError on a one-page PDF
    # when both skips were selected. Debug prints removed.
    start = 1 if skipfirst else 0
    end = len(pages) - 1 if skiplast else len(pages)
    pages = pages[start:max(start, end)]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # number of characters per chunk
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],  # default list
    )
    # https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846
    texts = text_splitter.split_documents(pages)
    # str.join avoids the quadratic cost of repeated string concatenation
    return "".join(text.page_content for text in texts)
55 |
+
|
56 |
+
|
57 |
+
def preproc_count(filepath, skipfirst, skiplast):
    """Extract text from the PDF at *filepath* and report its length.

    Returns a ``(text, length)`` tuple, where *length* is the character
    count of the extracted text.
    """
    extracted = file_preprocessing(filepath, skipfirst, skiplast)
    return extracted, len(extracted)
61 |
+
|
62 |
+
|
63 |
+
def postproc_count(summary):
    """Return the character count of the generated *summary*."""
    return len(summary)
66 |
+
|
67 |
+
|
68 |
+
# llm pipeline
def llm_pipeline(tokenizer, base_model, input_text):
    """Summarize *input_text* using the supplied model/tokenizer pair.

    Builds a transformers summarization pipeline (300-600 token output,
    truncating over-long input) and returns the generated summary string.
    """
    summarizer = pipeline(
        "summarization",
        model=base_model,
        tokenizer=tokenizer,
        max_length=600,
        min_length=300,
        truncation=True,
    )
    output = summarizer(input_text)
    return output[0]["summary_text"]
81 |
+
|
82 |
+
|
83 |
+
@st.cache_data
def displayPDF(file):
    """Render the PDF at *file* inline as a base64-encoded iframe."""
    with open(file, "rb") as pdf_file:
        encoded = base64.b64encode(pdf_file.read()).decode("utf-8")
    # embed the PDF in an HTML iframe and let Streamlit render the raw HTML
    frame = f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(frame, unsafe_allow_html=True)
92 |
+
|
93 |
+
|
94 |
+
# streamlit code
# Use the full browser width so the side-by-side result columns fit.
st.set_page_config(layout="wide")
96 |
+
|
97 |
+
|
98 |
+
def main():
    """Streamlit entry point: upload a PDF, pick a model, and summarize it."""
    st.title("RASA: Research Article Summarization App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
    if uploaded_file is not None:
        st.subheader("Options")
        col1, col2, col3 = st.columns([1, 1, 2])
        with col1:
            model_names = [
                "T5-Small",
                "BART",
            ]
            selected_model = st.radio("Select a model to use:", model_names)
            if selected_model == "BART":
                checkpoint = "ccdv/lsg-bart-base-16384-pubmed"
                # NOTE(review): trust_remote_code executes model-repo code;
                # required by this checkpoint's custom LSG attention.
                tokenizer = AutoTokenizer.from_pretrained(
                    checkpoint,
                    truncation=True,
                    legacy=False,
                    model_max_length=1000,
                    trust_remote_code=True,
                )
                base_model = AutoModelForSeq2SeqLM.from_pretrained(
                    checkpoint, torch_dtype=torch.float32, trust_remote_code=True
                )
            else:  # default Flan T5 small
                checkpoint = "MBZUAI/LaMini-Flan-T5-77M"
                tokenizer = AutoTokenizer.from_pretrained(
                    checkpoint,
                    truncation=True,
                    legacy=False,
                    model_max_length=1000,
                )
                base_model = AutoModelForSeq2SeqLM.from_pretrained(
                    checkpoint, torch_dtype=torch.float32
                )
        with col2:
            st.write("Skip any pages?")
            skipfirst = st.checkbox("Skip first page")
            skiplast = st.checkbox("Skip last page")
        with col3:
            st.write("Background information (links open in a new window)")
            # Fix: the class/model pairings were swapped — LaMini-Flan-T5-77M
            # is a T5-class model; lsg-bart-base-16384-pubmed is BART-class.
            st.write(
                "Model class: [T5-Small](https://huggingface.co/docs/transformers/main/en/model_doc/t5)"
                " | Specific model: [MBZUAI/LaMini-Flan-T5-77M](https://huggingface.co/MBZUAI/LaMini-Flan-T5-77M)"
            )
            st.write(
                "Model class: [BART](https://huggingface.co/docs/transformers/main/en/model_doc/bart)"
                " | Specific model: [ccdv/lsg-bart-base-16384-pubmed](https://huggingface.co/ccdv/lsg-bart-base-16384-pubmed)"
            )
        if st.button("Summarize"):
            col1, col2 = st.columns(2)
            # Fix: create the scratch directory so the write below cannot
            # fail with FileNotFoundError on a fresh checkout.
            os.makedirs("data", exist_ok=True)
            filepath = "data/" + uploaded_file.name
            with open(filepath, "wb") as temp_file:
                temp_file.write(uploaded_file.read())
            with col1:
                input_text, text_length = preproc_count(filepath, skipfirst, skiplast)
                # Fix: preproc_count returns a character count, so the label
                # now says "characters" instead of the incorrect "words".
                st.info(
                    "Uploaded PDF | Number of characters: "
                    f"{text_length:,}"
                )
                displayPDF(filepath)  # returns None; no point binding it
            with col2:
                with st.spinner("Please wait..."):
                    summary = llm_pipeline(tokenizer, base_model, input_text)
                    text_length = postproc_count(summary)
                    st.info(
                        "PDF Summary | Number of characters: "
                        f"{text_length:,}"
                    )
                    st.success(summary)
168 |
+
|
169 |
+
|
170 |
+
# Global CSS overrides to tighten the spacing of the widgets used above
# (radio labels, markdown paragraphs, checkboxes) and underline links.
st.markdown(
    """<style>
div[class*="stRadio"] > label > div[data-testid="stMarkdownContainer"] > p {
    font-size: 1rem;
    font-weight: 400;
}
div[class*="stMarkdown"] > div[data-testid="stMarkdownContainer"] > p {
    margin-bottom: -15px;
}
div[class*="stCheckbox"] > label {
    margin-bottom: -15px;
}
body > a {
    text-decoration: underline;
}
</style>
""",
    unsafe_allow_html=True,
)


# standard script entry point
if __name__ == "__main__":
    main()
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
sentence_transformers
|
3 |
+
torch
|
4 |
+
sentencepiece
|
5 |
+
transformers==4.34.0
|
6 |
+
accelerate
|
7 |
+
chromadb
|
8 |
+
pypdf
|
9 |
+
tiktoken
|
10 |
+
streamlit
|
11 |
+
fastapi
|
12 |
+
uvicorn
|
13 |
+
python-multipart
|
14 |
+
aiofiles
|
15 |
+
PyPDF2
|
16 |
+
PyMuPDF
|