update code
app.py
CHANGED
@@ -38,9 +38,10 @@ def file_preprocessing(file, skipfirst, skiplast):
     else:
         pages = pages
     print("")
-    print("# pages after
+    print("# pages after skip(s) ##########")
     print("")
     print(pages)
+    print("")
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,  # number of characters
         chunk_overlap=100,
@@ -49,16 +50,26 @@ def file_preprocessing(file, skipfirst, skiplast):
     )
     # https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846
     texts = text_splitter.split_documents(pages)
+    print("Number of tokens:" + str(len(texts)))
+    print("")
+    print("First three tokens:")
+    print(texts[0])
+    print("")
+    print(texts[1])
+    print("")
+    print(texts[2])
+    print("")
     final_texts = ""
     for text in texts:
         final_texts = final_texts + text.page_content
     return final_texts


+# function to count words in the input
 def preproc_count(filepath, skipfirst, skiplast):
     input_text = file_preprocessing(filepath, skipfirst, skiplast)
     text_length = len(input_text)
-    print("
+    print("Input word count: " f"{text_length:,}")
     return input_text, text_length

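A side note on the chunking shown above: the objects the new debug prints label as "tokens" are the chunks returned by split_documents, each a langchain Document holding up to chunk_size characters of text, with chunk_overlap characters repeated between neighbouring chunks. A minimal sketch of that behaviour, assuming only the RecursiveCharacterTextSplitter import app.py already uses (the sample text and variable names are illustrative):

from langchain.text_splitter import RecursiveCharacterTextSplitter

# roughly 3,000 characters of dummy text, purely for illustration
sample = "word " * 600

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # maximum characters per chunk
    chunk_overlap=100,  # characters shared between neighbouring chunks
)
# create_documents wraps plain strings into Document objects,
# the same type that split_documents returns for PDF pages
chunks = splitter.create_documents([sample])

print(len(chunks))                  # number of chunks (not tokens)
print(len(chunks[0].page_content))  # at most 1,000 characters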
@@ -80,9 +91,10 @@ def llm_pipeline(tokenizer, base_model, input_text, model_source):
     return summary


+# function to count words in the summary
 def postproc_count(summary):
     text_length = len(summary)
-    print("
+    print("Summary word count: " f"{text_length:,}")
     return text_length

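One caveat about the two counter helpers: len() on a Python string returns the number of characters, so the new "Input word count" and "Summary word count" prints (and the "Number of words" figure in st.info further down) actually report character counts. An illustrative snippet, not part of the commit, showing the difference:

def count_words(text: str) -> int:
    # len(text) counts characters; splitting on whitespace counts words
    return len(text.split())

sample_summary = "PDF summarization keeps the key points."
print(len(sample_summary))          # 39 characters
print(count_words(sample_summary))  # 6 words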
@@ -143,7 +155,6 @@ def main():
         truncation=True,
         legacy=False,
         model_max_length=1000,
-        #cache_dir="model_cache"
     )
     if model_source == "Download model":
         base_model = AutoModelForSeq2SeqLM.from_pretrained(
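On the deleted #cache_dir="model_cache" line: cache_dir is a standard keyword of the Hugging Face from_pretrained methods and points downloads at a local folder so they can be reused across runs. Re-enabling it would look roughly like the sketch below, using a small public checkpoint for illustration rather than whatever model app.py actually loads:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "t5-small"  # illustrative checkpoint, not necessarily the app's model

tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    model_max_length=1000,
    cache_dir="model_cache",  # store and reuse downloads under ./model_cache
)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    cache_dir="model_cache",
)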
@@ -185,13 +196,13 @@ def main():
     postproc_text_length = postproc_count(summary)
     end = time.time()
     duration = end - start
+    print("Duration: " f"{duration:.0f}" + " seconds")
     st.info(
         "PDF Summary | Number of words: "
         f"{postproc_text_length:,}"
         + " | Summarization time: "
         f"{duration:.0f}" + " seconds"
     )
-    #st.code("\n".join(tw.wrap(summary, width=80)), language='md')
     st.success(summary)

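A final note on the string formatting in the new print("Duration: ...") line and the st.info call: the adjacent string literals rely on Python's implicit concatenation, so mixing them with "+" produces the same result as a single f-string. Illustrative only:

duration = 42.0
print("Duration: " f"{duration:.0f}" + " seconds")  # implicit concatenation mixed with "+"
print(f"Duration: {duration:.0f} seconds")          # identical output from one f-string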