update code
Browse files
app.py
CHANGED
@@ -6,21 +6,26 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
6 |
from PyPDF2 import PdfReader
|
7 |
import re
|
8 |
import streamlit as st
|
|
|
9 |
import sys
|
10 |
import time
|
11 |
import torch
|
12 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
|
13 |
from transformers import pipeline
|
14 |
|
15 |
-
#
|
16 |
# https://huggingface.co/docs/transformers/pad_truncation
|
|
|
|
|
17 |
|
18 |
|
19 |
# file loader and preprocessor
|
20 |
-
def file_preprocessing(
|
|
|
|
|
21 |
loader = PyMuPDFLoader(file)
|
22 |
pages = loader.load_and_split()
|
23 |
-
#
|
24 |
if (skipfirst == 1) & (skiplast == 0):
|
25 |
del pages[0]
|
26 |
elif (skipfirst == 0) & (skiplast == 1):
|
@@ -30,104 +35,156 @@ def file_preprocessing(file, skipfirst, skiplast):
|
|
30 |
del pages[-1]
|
31 |
else:
|
32 |
pages = pages
|
33 |
-
|
34 |
-
content = ""
|
35 |
for page in pages:
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
text_splitter = RecursiveCharacterTextSplitter(
|
43 |
-
chunk_size=
|
44 |
-
chunk_overlap=
|
45 |
length_function=len,
|
46 |
-
separators=["\n\n", "\n", " ", ""], #
|
47 |
)
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
print(
|
54 |
-
|
55 |
-
print("")
|
56 |
-
print(texts[2])
|
57 |
-
print("")
|
58 |
-
final_texts = ""
|
59 |
-
for text in texts:
|
60 |
-
final_texts = final_texts + text
|
61 |
-
return texts, final_texts
|
62 |
|
63 |
|
64 |
-
#
|
65 |
-
def
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
formatted_sentences = [sentence.capitalize() for sentence in sentences]
|
77 |
-
return " ".join(formatted_sentences)
|
78 |
|
79 |
|
80 |
-
#
|
81 |
-
def llm_pipeline(
|
82 |
-
|
83 |
-
|
|
|
|
|
84 |
model=base_model,
|
85 |
tokenizer=tokenizer,
|
86 |
-
max_length=300,
|
87 |
-
min_length=200,
|
88 |
truncation=True,
|
89 |
)
|
90 |
print("Model source: %s" % (model_source))
|
91 |
-
print("Summarizing
|
92 |
-
result =
|
|
|
|
|
|
|
|
|
93 |
summary = result[0]["summary_text"]
|
94 |
-
print("Summarization finished\n")
|
95 |
print("Summary text:\n")
|
96 |
print(summary)
|
97 |
-
print("")
|
98 |
return summary
|
99 |
|
100 |
|
101 |
-
#
|
102 |
-
def
|
103 |
text_length = len(re.findall(r"\w+", summary))
|
104 |
-
print("
|
105 |
return text_length
|
106 |
|
107 |
|
108 |
-
#
|
109 |
def clean_summary_text(summary):
|
110 |
-
#
|
111 |
-
|
112 |
-
#
|
113 |
-
|
114 |
-
#
|
115 |
-
|
116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
|
119 |
@st.cache_data(ttl=60 * 60)
|
120 |
-
#
|
121 |
def displayPDF(file):
|
122 |
with open(file, "rb") as f:
|
123 |
base64_pdf = base64.b64encode(f.read()).decode("utf-8")
|
124 |
-
#
|
125 |
pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
|
126 |
-
#
|
127 |
st.markdown(pdf_display, unsafe_allow_html=True)
|
128 |
|
129 |
|
130 |
-
#
|
131 |
st.set_page_config(layout="wide")
|
132 |
|
133 |
|
@@ -152,14 +209,15 @@ def main():
|
|
152 |
selected_model = st.radio(
|
153 |
"Select a model to use:",
|
154 |
model_names,
|
155 |
-
help="Defauls to T5-Small; for most articles it summarizes better than BART",
|
156 |
)
|
157 |
if selected_model == "BART":
|
|
|
|
|
158 |
checkpoint = "ccdv/lsg-bart-base-16384-pubmed"
|
159 |
tokenizer = AutoTokenizer.from_pretrained(
|
160 |
checkpoint,
|
161 |
truncation=True,
|
162 |
-
model_max_length=
|
163 |
trust_remote_code=True,
|
164 |
)
|
165 |
if model_source == "Download model":
|
@@ -171,12 +229,14 @@ def main():
|
|
171 |
else:
|
172 |
base_model = "model_cache/models--ccdv--lsg-bart-base-16384-pubmed/snapshots/4072bc1a7a94e2b4fd860a5fdf1b71d0487dcf15"
|
173 |
else:
|
|
|
|
|
174 |
checkpoint = "MBZUAI/LaMini-Flan-T5-77M"
|
175 |
tokenizer = AutoTokenizer.from_pretrained(
|
176 |
checkpoint,
|
177 |
truncation=True,
|
178 |
legacy=False,
|
179 |
-
model_max_length=
|
180 |
)
|
181 |
if model_source == "Download model":
|
182 |
base_model = AutoModelForSeq2SeqLM.from_pretrained(
|
@@ -201,64 +261,142 @@ def main():
|
|
201 |
"Model class: [BART](https://huggingface.co/docs/transformers/main/en/model_doc/bart)"
|
202 |
" | Model: [lsg-bart-base-16384-pubmed](https://huggingface.co/ccdv/lsg-bart-base-16384-pubmed)"
|
203 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
if st.button("Summarize"):
|
205 |
col1, col2 = st.columns(2)
|
206 |
filepath = "data/" + uploaded_file.name
|
207 |
with open(filepath, "wb") as temp_file:
|
208 |
temp_file.write(uploaded_file.read())
|
209 |
with col1:
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
)
|
213 |
st.info(
|
214 |
"Uploaded PDF | Number of words: "
|
215 |
-
f"{
|
216 |
)
|
217 |
pdf_viewer = displayPDF(filepath)
|
218 |
with col2:
|
219 |
start = time.time()
|
220 |
with st.spinner("Summarizing..."):
|
221 |
summary = llm_pipeline(
|
222 |
-
tokenizer,
|
|
|
|
|
|
|
|
|
223 |
)
|
224 |
-
|
|
|
225 |
end = time.time()
|
226 |
duration = end - start
|
227 |
print("Duration: " f"{duration:.0f}" + " seconds")
|
228 |
st.info(
|
229 |
"PDF Summary | Number of words: "
|
230 |
-
f"{
|
231 |
+ " | Summarization time: "
|
232 |
f"{duration:.0f}" + " seconds"
|
233 |
)
|
234 |
if selected_model == "BART":
|
|
|
235 |
summary_cleaned = clean_summary_text(summary)
|
236 |
-
|
237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
st.write(summary)
|
239 |
else:
|
240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
col1 = st.columns(1)
|
242 |
url = "https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846"
|
243 |
st.info("Additional information")
|
244 |
-
|
245 |
-
|
|
|
246 |
st.write(
|
247 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
)
|
249 |
st.write(
|
250 |
" length_function=len"
|
251 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
st.write("")
|
253 |
-
st.write(
|
254 |
st.write("")
|
255 |
-
st.write(
|
256 |
-
st.write("
|
257 |
-
|
258 |
-
st.write(
|
259 |
-
|
260 |
-
|
261 |
-
st.write(
|
|
|
262 |
|
263 |
|
264 |
st.markdown(
|
@@ -273,6 +411,10 @@ div[class*="stMarkdown"] > div[data-testid="stMarkdownContainer"] > p {
|
|
273 |
div[class*="stCheckbox"] > label[data-baseweb="checkbox"] {
|
274 |
margin-bottom: -15px;
|
275 |
}
|
|
|
|
|
|
|
|
|
276 |
body > a {
|
277 |
text-decoration: underline;
|
278 |
}
|
|
|
6 |
from PyPDF2 import PdfReader
|
7 |
import re
|
8 |
import streamlit as st
|
9 |
+
from streamlit_tags import st_tags
|
10 |
import sys
|
11 |
import time
|
12 |
import torch
|
13 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
|
14 |
from transformers import pipeline
|
15 |
|
16 |
+
# Notes
|
17 |
# https://huggingface.co/docs/transformers/pad_truncation
|
18 |
+
# https://stackoverflow.com/questions/76431655/langchain-pypdfloader
|
19 |
+
# https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846
|
20 |
|
21 |
|
22 |
# file loader and preprocessor
|
23 |
+
def file_preprocessing(
|
24 |
+
file, skipfirst, skiplast, chunk_size, chunk_overlap, exclude_words
|
25 |
+
):
|
26 |
loader = PyMuPDFLoader(file)
|
27 |
pages = loader.load_and_split()
|
28 |
+
# Skip user-specified page(s)
|
29 |
if (skipfirst == 1) & (skiplast == 0):
|
30 |
del pages[0]
|
31 |
elif (skipfirst == 0) & (skiplast == 1):
|
|
|
35 |
del pages[-1]
|
36 |
else:
|
37 |
pages = pages
|
38 |
+
input_text = ""
|
|
|
39 |
for page in pages:
|
40 |
+
input_text = input_text + page.page_content
|
41 |
+
input_text = re.sub("-\n", "", input_text)
|
42 |
+
input_text = re.sub(r"\n", " ", input_text)
|
43 |
+
# Initialize a list to store valid sentences
|
44 |
+
valid_sentences = []
|
45 |
+
# Split the input_text into sentences
|
46 |
+
sentences = re.split(r"(?<=[.!?])\s+", input_text)
|
47 |
+
# Iterate through each sentence
|
48 |
+
for sentence in sentences:
|
49 |
+
# Check if any exclude_word is present in the sentence
|
50 |
+
if any(word in sentence for word in exclude_words):
|
51 |
+
continue # Skip sentences with exclude_words
|
52 |
+
valid_sentences.append(sentence)
|
53 |
+
final_input_text = " ".join(valid_sentences)
|
54 |
+
print("\n############## New article ##############\n")
|
55 |
+
print("Cleaned and formatted input text:\n")
|
56 |
+
print(final_input_text)
|
57 |
+
print("\nExcluded words: " + str(exclude_words))
|
58 |
+
print("\nChunking input text...\n")
|
59 |
text_splitter = RecursiveCharacterTextSplitter(
|
60 |
+
chunk_size=chunk_size, # Number of characters
|
61 |
+
chunk_overlap=chunk_overlap,
|
62 |
length_function=len,
|
63 |
+
separators=["\n\n", "\n", " ", ""], # Default list
|
64 |
)
|
65 |
+
text_chunks = text_splitter.split_text(final_input_text)
|
66 |
+
print("Number of chunks: " + str(len(text_chunks)), end="")
|
67 |
+
chunks = ""
|
68 |
+
for text in text_chunks:
|
69 |
+
chunks = chunks + "\n\n" + text
|
70 |
+
print(chunks)
|
71 |
+
return final_input_text, text_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
|
74 |
+
# Function to count words in the input
|
75 |
+
def preprocessing_word_count(
    filepath, skipfirst, skiplast, chunk_size, chunk_overlap, exclude_words
):
    """Preprocess the PDF and report the word count of the cleaned text.

    All arguments are forwarded unchanged to ``file_preprocessing``. The
    cleaned text it returns is scanned for runs of word characters, and the
    resulting count is logged to stdout together with the chunking settings.

    Returns:
        A ``(final_input_text, text_chunks, text_length)`` tuple: the two
        values produced by ``file_preprocessing`` plus the word count.
    """
    preprocessed = file_preprocessing(
        filepath, skipfirst, skiplast, chunk_size, chunk_overlap, exclude_words
    )
    final_input_text, text_chunks = preprocessed
    # One regex match per word; count them without building a list
    text_length = sum(1 for _ in re.finditer(r"\w+", final_input_text))
    print("\nInput word count: " f"{text_length:,}")
    print("Chunk size: " f"{chunk_size:,}")
    print("Chunk overlap: %s" % chunk_overlap)
    return final_input_text, text_chunks, text_length
|
|
|
|
|
86 |
|
87 |
|
88 |
+
# LLM pipeline for summarization
|
89 |
+
def llm_pipeline(
    tokenizer, base_model, final_input_text, model_source, minimum_token_number
):
    """Summarize ``final_input_text`` and return the generated summary text.

    A ``summarization`` pipeline is built from ``base_model`` and
    ``tokenizer`` with truncation enabled, then called once on the input.
    ``minimum_token_number`` is passed as ``min_length`` and the tokenizer's
    ``model_max_length`` as ``max_length``. ``model_source`` is only used for
    console logging.

    Returns:
        The ``summary_text`` field of the first pipeline result.
    """
    pipeline_options = {
        "task": "summarization",
        "model": base_model,
        "tokenizer": tokenizer,
        "truncation": True,
    }
    summarizer = pipeline(**pipeline_options)
    print("Model source: %s" % (model_source))
    print("Summarizing...\n")
    outputs = summarizer(
        final_input_text,
        min_length=minimum_token_number,
        max_length=tokenizer.model_max_length,
    )
    # The pipeline returns one result dict per input; only one input is sent
    summary = outputs[0]["summary_text"]
    print("Summary text:\n")
    print(summary)
    return summary
|
109 |
|
110 |
|
111 |
+
# Function to count words in the summary
|
112 |
+
def postprocessing_word_count(summary):
    """Count the words in ``summary``, log the count, and return it.

    A word is any maximal run of regex word characters, so punctuation and
    whitespace act as separators.
    """
    # Tally matches lazily instead of materializing the list of words
    text_length = sum(1 for _ in re.finditer(r"\w+", summary))
    print("\nSummary word count: " f"{text_length:,}")
    return text_length
|
116 |
|
117 |
|
118 |
+
# Function to clean bart summary text
|
119 |
def clean_summary_text(summary):
    """Tidy the raw BART summary so punctuation spacing reads naturally.

    Steps, in order:
      1. Replace every line break (and the whitespace around it) with a
         single space.
      2. Strip leading and trailing whitespace.
      3. Remove whitespace before ``. , ; : ) ! ?`` when that mark is followed
         by whitespace or the end of the text.
      4. Remove a whitespace character directly after ``(``.
      5. Remove whitespace between ``)`` and a following punctuation mark.

    Args:
        summary: Unformatted summary text.

    Returns:
        The cleaned summary string.
    """
    # Substitute a space rather than nothing: deleting the break outright
    # glued the last word of one line to the first word of the next, and a
    # break with no trailing whitespace was previously left in place.
    text = re.sub(r"\s*\n\s*", " ", summary)
    text = text.strip()
    # Remove any spaces before punctuation (bart)
    text = re.sub(r"\s+([.,;:)!?](?:\s|$))", r"\1", text)
    # Remove any space after "("
    text = re.sub(r"\(\s", r"(", text)
    # Remove any spaces between the closing parenthesis and other punctuation;
    # step 3 consumes the space after ")" and so cannot match ") ," itself
    text = re.sub(r"(\))\s+([,.:;?!])", r"\1\2", text)
    return text
|
131 |
+
|
132 |
+
|
133 |
+
# Function to convert bart summary to sentence case
|
134 |
+
def convert_to_sentence_case(summary):
    """Capitalize the first character of every sentence in ``summary``.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?`` and
    re-joined with single spaces.

    Args:
        summary: Text to convert.

    Returns:
        The text with each sentence starting in upper case.
    """
    # Split the paragraph into sentences based on '.', '!', or '?'
    sentences = re.split(r"(?<=[.!?])\s+", summary)
    # Upper-case only the leading character. str.capitalize() would also
    # lower-case the rest of the sentence and mangle acronyms and proper
    # nouns (e.g. "DNA" -> "Dna").
    return " ".join(sentence[:1].upper() + sentence[1:] for sentence in sentences)
|
140 |
+
|
141 |
+
|
142 |
+
def remove_duplicate_sentences(summary):
    """Drop repeated sentences from ``summary``, keeping first occurrences.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.
    Comparison is exact (case- and punctuation-sensitive). The surviving
    sentences are joined with single spaces in their original order.
    """
    sentences = re.split(r"(?<=[.!?])\s+", summary)
    # dict keys are unique and keep insertion order, so this retains the
    # first appearance of each sentence and discards later repeats
    return " ".join(dict.fromkeys(sentences))
|
158 |
+
|
159 |
+
|
160 |
+
# Function to remove incomplete last sentence from summary
|
161 |
+
def remove_incomplete_last_sentence(summary):
    """Remove a trailing sentence fragment from ``summary``.

    The text is split into sentences on whitespace that follows ``.``, ``!``
    or ``?``. If the last piece is non-blank and does not end in one of those
    marks, it is treated as a cut-off fragment and dropped. Closing quotes
    and brackets after the terminal mark are ignored for this check.

    Args:
        summary: Text that may end mid-sentence.

    Returns:
        The remaining sentences joined with single spaces.
    """
    closing_marks = "\"')]}\u201d\u2019"
    # Split the paragraph into sentences based on '.', '!', or '?'
    sentences = re.split(r"(?<=[.!?])\s+", summary)
    # re.split always yields at least one element, so [-1] is safe
    last_sentence = sentences[-1].strip()
    # A sentence such as 'She said "stop."' is complete even though its last
    # character is a quote; look past closing quotes/brackets before testing.
    sentence_core = last_sentence.rstrip(closing_marks)
    if last_sentence and not sentence_core.endswith((".", "!", "?")):
        # Remove the unfinished last sentence from the paragraph
        sentences.pop()
    # Join the sentences back together
    return " ".join(sentences)
|
174 |
|
175 |
|
176 |
@st.cache_data(ttl=60 * 60)
def displayPDF(file):
    """Embed the PDF stored at path ``file`` in the Streamlit page.

    The file is read as bytes, base64-encoded, and placed in the ``src`` of
    an ``<iframe>`` as a ``data:`` URI, which is then rendered with
    ``st.markdown``. The function is wrapped in ``st.cache_data`` with a TTL
    of 3600 seconds.
    """
    # NOTE(review): the cache is presumably keyed on the path string only, so
    # a different upload saved under the same name may show stale content
    # until the TTL expires - confirm against Streamlit's caching rules.
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()
    base64_pdf = base64.b64encode(raw_bytes).decode("utf-8")
    # Embed pdf in html
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
    # Raw HTML must be explicitly allowed for the iframe to render
    st.markdown(pdf_display, unsafe_allow_html=True)
|
185 |
|
186 |
|
187 |
+
# Streamlit code
|
188 |
st.set_page_config(layout="wide")
|
189 |
|
190 |
|
|
|
209 |
selected_model = st.radio(
|
210 |
"Select a model to use:",
|
211 |
model_names,
|
|
|
212 |
)
|
213 |
if selected_model == "BART":
|
214 |
+
chunk_size = 800
|
215 |
+
chunk_overlap = 80
|
216 |
checkpoint = "ccdv/lsg-bart-base-16384-pubmed"
|
217 |
tokenizer = AutoTokenizer.from_pretrained(
|
218 |
checkpoint,
|
219 |
truncation=True,
|
220 |
+
model_max_length=512,
|
221 |
trust_remote_code=True,
|
222 |
)
|
223 |
if model_source == "Download model":
|
|
|
229 |
else:
|
230 |
base_model = "model_cache/models--ccdv--lsg-bart-base-16384-pubmed/snapshots/4072bc1a7a94e2b4fd860a5fdf1b71d0487dcf15"
|
231 |
else:
|
232 |
+
chunk_size = 1000
|
233 |
+
chunk_overlap = 100
|
234 |
checkpoint = "MBZUAI/LaMini-Flan-T5-77M"
|
235 |
tokenizer = AutoTokenizer.from_pretrained(
|
236 |
checkpoint,
|
237 |
truncation=True,
|
238 |
legacy=False,
|
239 |
+
model_max_length=512,
|
240 |
)
|
241 |
if model_source == "Download model":
|
242 |
base_model = AutoModelForSeq2SeqLM.from_pretrained(
|
|
|
261 |
"Model class: [BART](https://huggingface.co/docs/transformers/main/en/model_doc/bart)"
|
262 |
" | Model: [lsg-bart-base-16384-pubmed](https://huggingface.co/ccdv/lsg-bart-base-16384-pubmed)"
|
263 |
)
|
264 |
+
exclude_words = st_tags(
|
265 |
+
label="Enter words/phrases to exclude from the summary:",
|
266 |
+
text="Press enter to add words/phrases",
|
267 |
+
)
|
268 |
+
col1, col2, col3 = st.columns([1, 1, 5])
|
269 |
+
with col1:
|
270 |
+
minimum_token_number = st.number_input(
|
271 |
+
"Minimum number of tokens",
|
272 |
+
value=200,
|
273 |
+
step=25,
|
274 |
+
min_value=0,
|
275 |
+
max_value=512,
|
276 |
+
help="Use a larger number of tokens to increase summary length",
|
277 |
+
)
|
278 |
+
with col3:
|
279 |
+
st.subheader("Notes")
|
280 |
+
st.write(
|
281 |
+
"To remove content from the summary, copy and paste the word(s) and/or phrase(s) to exclude into the box above and summarize again."
|
282 |
+
)
|
283 |
+
st.write(
|
284 |
+
"To lengthen or shorten the summary, increase or decrease the minimum number of tokens to the left and summarize again."
|
285 |
+
)
|
286 |
if st.button("Summarize"):
|
287 |
col1, col2 = st.columns(2)
|
288 |
filepath = "data/" + uploaded_file.name
|
289 |
with open(filepath, "wb") as temp_file:
|
290 |
temp_file.write(uploaded_file.read())
|
291 |
with col1:
|
292 |
+
(
|
293 |
+
final_input_text,
|
294 |
+
text_chunks,
|
295 |
+
preprocessing_text_length,
|
296 |
+
) = preprocessing_word_count(
|
297 |
+
filepath,
|
298 |
+
skipfirst,
|
299 |
+
skiplast,
|
300 |
+
chunk_size,
|
301 |
+
chunk_overlap,
|
302 |
+
exclude_words,
|
303 |
)
|
304 |
st.info(
|
305 |
"Uploaded PDF | Number of words: "
|
306 |
+
f"{preprocessing_text_length:,}"
|
307 |
)
|
308 |
pdf_viewer = displayPDF(filepath)
|
309 |
with col2:
|
310 |
start = time.time()
|
311 |
with st.spinner("Summarizing..."):
|
312 |
summary = llm_pipeline(
|
313 |
+
tokenizer,
|
314 |
+
base_model,
|
315 |
+
final_input_text,
|
316 |
+
model_source,
|
317 |
+
minimum_token_number,
|
318 |
)
|
319 |
+
# Count summary words
|
320 |
+
postprocessing_text_length = postprocessing_word_count(summary)
|
321 |
end = time.time()
|
322 |
duration = end - start
|
323 |
print("Duration: " f"{duration:.0f}" + " seconds")
|
324 |
st.info(
|
325 |
"PDF Summary | Number of words: "
|
326 |
+
f"{postprocessing_text_length:,}"
|
327 |
+ " | Summarization time: "
|
328 |
f"{duration:.0f}" + " seconds"
|
329 |
)
|
330 |
if selected_model == "BART":
|
331 |
+
# Use regex to clean the unformatted bart summary
|
332 |
summary_cleaned = clean_summary_text(summary)
|
333 |
+
# Convert to sentence case
|
334 |
+
summary_cleaned_sentence_case = convert_to_sentence_case(
|
335 |
+
summary_cleaned
|
336 |
+
)
|
337 |
+
# Remove duplicate sentences
|
338 |
+
summary_cleaned_sentence_case_dedup = remove_duplicate_sentences(
|
339 |
+
summary_cleaned_sentence_case
|
340 |
+
)
|
341 |
+
# Remove incomplete last sentence
|
342 |
+
summary_cleaned_final = remove_incomplete_last_sentence(
|
343 |
+
summary_cleaned_sentence_case_dedup
|
344 |
+
)
|
345 |
+
st.success(summary_cleaned_final)
|
346 |
+
with st.expander("Unformatted output"):
|
347 |
st.write(summary)
|
348 |
else:
|
349 |
+
# Remove duplicate sentences
|
350 |
+
summary_dedup = remove_duplicate_sentences(summary)
|
351 |
+
# Remove incomplete last sentence
|
352 |
+
summary_final = remove_incomplete_last_sentence(summary_dedup)
|
353 |
+
st.success(summary_final)
|
354 |
+
with st.expander("Unformatted output"):
|
355 |
+
st.write(summary)
|
356 |
col1 = st.columns(1)
|
357 |
url = "https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846"
|
358 |
st.info("Additional information")
|
359 |
+
input_ids = tokenizer.encode(
|
360 |
+
final_input_text, add_special_tokens=True, truncation=True
|
361 |
+
)
|
362 |
st.write(
|
363 |
+
"Maximum number of tokens generated for inputs into the model: %s"
|
364 |
+
% f"{len(input_ids):,}"
|
365 |
+
)
|
366 |
+
st.write("First 10 tokens:")
|
367 |
+
first_10_tokens = input_ids[:10]
|
368 |
+
first_10_tokens_text = tokenizer.convert_ids_to_tokens(first_10_tokens)
|
369 |
+
st.write(first_10_tokens_text)
|
370 |
+
|
371 |
+
st.write("[RecursiveCharacterTextSplitter](%s) parameters used:" % url)
|
372 |
+
st.write(
|
373 |
+
" chunk_size=%s"
|
374 |
+
% chunk_size
|
375 |
+
)
|
376 |
+
st.write(
|
377 |
+
" chunk_overlap=%s"
|
378 |
+
% chunk_overlap
|
379 |
)
|
380 |
st.write(
|
381 |
" length_function=len"
|
382 |
)
|
383 |
+
st.write("\n")
|
384 |
+
st.write("Number of input text chunks: " + str(len(text_chunks)))
|
385 |
+
st.write("")
|
386 |
+
st.write("First three chunks:")
|
387 |
+
st.write("\n")
|
388 |
+
st.write(text_chunks[0])
|
389 |
st.write("")
|
390 |
+
st.write(text_chunks[1])
|
391 |
st.write("")
|
392 |
+
st.write(text_chunks[2])
|
393 |
+
st.write("\n")
|
394 |
+
|
395 |
+
st.write(
|
396 |
+
"Extracted and cleaned text, less sentences containing excluded words:"
|
397 |
+
)
|
398 |
+
st.write("")
|
399 |
+
st.write(final_input_text)
|
400 |
|
401 |
|
402 |
st.markdown(
|
|
|
411 |
div[class*="stCheckbox"] > label[data-baseweb="checkbox"] {
|
412 |
margin-bottom: -15px;
|
413 |
}
|
414 |
+
div[class*="stNumberInput"] > label > div[data-testid="stMarkdownContainer"] > p {
|
415 |
+
font-size: 1rem;
|
416 |
+
font-weight: 400;
|
417 |
+
}
|
418 |
body > a {
|
419 |
text-decoration: underline;
|
420 |
}
|