Spaces:
Runtime error
Upload 6 files
Browse files
- app.py +280 -0
- example.txt +1 -0
- requirements.txt +9 -0
- summarize.py +144 -0
- textrank.py +41 -0
- utils.py +121 -0
app.py
ADDED
@@ -0,0 +1,280 @@
import os
import contextlib
import logging
import random
import re
import time
from pathlib import Path

import gradio as gr
import nltk
from cleantext import clean

from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from utils import load_example_filenames, truncate_word_count, saves_summary
from textrank import get_summary

example_path = "/content/drive/MyDrive/space/"
nltk.download("stopwords")  # TODO: find where this requirement originates from

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def proc_submission(
    input_text: str,
    model_size: str,
    num_beams,
    token_batch_length,
    length_penalty,
    repetition_penalty,
    no_repeat_ngram_size,
    max_input_length: int = 1024,
):

    settings = {
        "length_penalty": float(length_penalty),
        "repetition_penalty": float(repetition_penalty),
        "no_repeat_ngram_size": int(no_repeat_ngram_size),
        "encoder_no_repeat_ngram_size": 4,
        "num_beams": int(num_beams),
        "min_length": 4,
        "max_length": int(token_batch_length // 4),
        "early_stopping": True,
        "do_sample": False,
    }
    st = time.perf_counter()
    history = {}
    clean_text = clean(input_text, lower=False)
    max_input_length = 1024 if "base" in model_size.lower() else max_input_length
    clean_text = get_summary(clean_text)
    processed = truncate_word_count(clean_text, max_input_length)

    if processed["was_truncated"]:
        tr_in = processed["truncated_text"]
        # create elaborate HTML warning
        input_wc = re.split(r"\s+", input_text)
        msg = f"""
        <div style="background-color: #FFA500; color: white; padding: 20px;">
        <h3>Warning</h3>
        <p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/len(input_wc):.2f}% of the submission.</p>
        </div>
        """
        logging.warning(msg)
        history["WARNING"] = msg
    else:
        tr_in = input_text
        msg = None

    if len(input_text) < 50:
        # this is essentially a different case from the above
        msg = f"""
        <div style="background-color: #880808; color: white; padding: 20px;">
        <h3>Warning</h3>
        <p>Input text is too short to summarize. Detected {len(input_text)} characters.
        Please load text by selecting an example from the dropdown menu or by pasting text into the text box.</p>
        </div>
        """
        logging.warning(msg)
        logging.warning("RETURNING EMPTY STRING")
        history["WARNING"] = msg

        return msg, "", "", None

    _summaries = summarize_via_tokenbatches(
        tr_in,
        model,
        tokenizer,
        batch_length=token_batch_length,
        **settings,
    )
    sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
    sum_scores = [
        f" - Section {i}: {round(s['summary_score'],4)}"
        for i, s in enumerate(_summaries)
    ]

    sum_text_out = "\n".join(sum_text)
    history["Summary Scores"] = "<br><br>"
    scores_out = "\n".join(sum_scores)
    rt = round((time.perf_counter() - st) / 60, 2)
    print(f"Runtime: {rt} minutes")
    html = ""
    html += f"<p>Runtime: {rt} minutes on CPU</p>"
    if msg is not None:
        html += msg

    html += ""

    # save to file
    saved_file = saves_summary(_summaries)

    return html, sum_text_out, scores_out, saved_file


def load_single_example_text(
    example_path: str or Path = "/content/example.txt",
    max_pages=20,
):
    """
    load_single_example_text - a helper function for the gradio module to load a single example file
    Returns:
        str, the example text
    """
    global name_to_path
    full_ex_path = name_to_path[example_path]
    full_ex_path = Path(full_ex_path)
    if full_ex_path.suffix == ".txt":
        with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
            raw_text = f.read()
            text = clean(raw_text, lower=False)
    else:
        logging.error(f"Unknown file type {full_ex_path.suffix}")
        text = "ERROR - check example path"

    return text


if __name__ == "__main__":
    logging.info("Starting app instance")
    os.environ[
        "TOKENIZERS_PARALLELISM"
    ] = "false"  # parallelism on tokenizers is buggy with gradio
    logging.info("Loading summarization model")
    with contextlib.redirect_stdout(None):
        model, tokenizer = load_model_and_tokenizer(
            "SmartPy/bart-large-cnn-finetuned-scientific_summarize"
        )

    name_to_path = load_example_filenames(example_path)
    logging.info(f"Loaded {len(name_to_path)} examples")
    demo = gr.Blocks()
    _examples = list(name_to_path.keys())
    with demo:

        gr.Markdown("# Document Summarization with Long-Document Transformers")
        gr.Markdown(
            "This is an example use case for fine-tuned long-document transformers. The model is trained on scientific article summaries (via the Yale Scientific Article Summarization Dataset). The model in this demo is [bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn)."
        )
        with gr.Column():

            gr.Markdown("## Load Inputs & Select Parameters")
            gr.Markdown(
                "Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate)."
            )
            with gr.Row(variant="compact"):
                with gr.Column(scale=0.5, variant="compact"):

                    model_size = gr.Radio(
                        choices=["bart-large-cnn"],
                        label="Model Variant",
                        value="bart-large-cnn",
                    )
                    num_beams = gr.Radio(
                        choices=[2, 3, 4],
                        label="Beam Search: # of Beams",
                        value=2,
                    )
                with gr.Column(variant="compact"):
                    example_name = gr.Dropdown(
                        _examples,
                        label="Examples",
                        value=random.choice(_examples),
                    )

            with gr.Row():
                input_text = gr.Textbox(
                    lines=4,
                    label="Input Text (for summarization)",
                    placeholder="Enter text to summarize; it will be cleaned and truncated on Spaces. Narrative, academic (papers and lecture transcriptions), and article text all work well. Generation may take a while depending on the input text :)",
                )
                with gr.Column(min_width=100, scale=0.5):
                    load_examples_button = gr.Button(
                        "Load Example",
                    )

        with gr.Column():
            gr.Markdown("## Generate Summary")
            gr.Markdown(
                "Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios."
            )
            summarize_button = gr.Button(
                "Summarize!",
                variant="primary",
            )

            output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
            gr.Markdown("### Summary Output")
            summary_text = gr.Textbox(
                label="Summary", placeholder="The generated summary will appear here"
            )
            gr.Markdown(
                "The summary scores can be thought of as representing the quality of the summary. Less-negative numbers (closer to 0) are better:"
            )
            summary_scores = gr.Textbox(
                label="Summary Scores", placeholder="Summary scores will appear here"
            )

            text_file = gr.File(
                label="Download Summary as Text File",
                file_count="single",
                type="file",
                interactive=False,
            )

        gr.Markdown("---")
        with gr.Column():
            gr.Markdown("### Advanced Settings")
            with gr.Row(variant="compact"):
                length_penalty = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    label="length penalty",
                    value=0.7,
                    step=0.05,
                )
                token_batch_length = gr.Radio(
                    choices=[512, 768, 1024, 1536],
                    label="token batch length",
                    value=1024,
                )

            with gr.Row(variant="compact"):
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=5.0,
                    label="repetition penalty",
                    value=3.5,
                    step=0.1,
                )
                no_repeat_ngram_size = gr.Radio(
                    choices=[2, 3, 4],
                    label="no repeat ngram size",
                    value=3,
                )
        with gr.Column():
            gr.Markdown("### About the Model")
            gr.Markdown(
                "These models are fine-tuned on the [1000 most cited papers in the ACL Anthology Network (AAN)](http://arxiv.org/pdf/1909.01716.pdf). The goal was to create a model that generalizes well and is useful for summarizing large amounts of text in academic and daily usage."
            )
            gr.Markdown("---")

        load_examples_button.click(
            fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
        )

        summarize_button.click(
            fn=proc_submission,
            inputs=[
                input_text,
                model_size,
                num_beams,
                token_batch_length,
                length_penalty,
                repetition_penalty,
                no_repeat_ngram_size,
            ],
            outputs=[output_text, summary_text, summary_scores, text_file],
        )

    demo.launch(enable_queue=True, debug=True)
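For reference, the pipeline that proc_submission wires into the Gradio UI (clean, TextRank pre-filter, truncate, abstractive summarization, save) can also be exercised headlessly. The sketch below is illustrative only: it assumes the files in this commit are importable as modules, reuses the default generation settings from the UI, and reads the bundled example.txt.

# illustrative, headless version of the app's pipeline (a sketch, not part of the Space)
from cleantext import clean

from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from textrank import get_summary
from utils import truncate_word_count, saves_summary

model, tokenizer = load_model_and_tokenizer(
    "SmartPy/bart-large-cnn-finetuned-scientific_summarize"
)

raw_text = open("example.txt", encoding="utf-8").read()
text = clean(raw_text, lower=False)                   # normalize the raw text
text = get_summary(text)                              # extractive TextRank pre-filter (~1000 words)
text = truncate_word_count(text, max_words=1024)["truncated_text"]

summaries = summarize_via_tokenbatches(
    text,
    model,
    tokenizer,
    batch_length=1024,
    num_beams=2,
    length_penalty=0.7,
    repetition_penalty=3.5,
    no_repeat_ngram_size=3,
    encoder_no_repeat_ngram_size=4,
    min_length=4,
    max_length=256,                                   # 1024 // 4, as in proc_submission
    early_stopping=True,
    do_sample=False,
)
print(saves_summary(summaries))                       # path of the saved summary file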
example.txt
ADDED
@@ -0,0 +1 @@
Human evaluation machine translation ( MT ) weigh many aspect translation , include adequacy , fidelity , fluency translation ( Hovy , 1999 ; White O ’ Connell , 1994 ) . A comprehensive catalog MT evaluation technique rich literature give Reeder ( 2001 ) . For part , various human evaluation approach quite expensive ( Hovy , 1999 ) . Moreover , take week month finish . This big problem developer machine translation system need monitor effect daily change system order weed bad idea good ideas . We believe MT progress stem evaluation logjam fruitful research idea wait release 1So call method bilingual evaluation understudy , BLEU . evaluation bottleneck . Developers would benefit inexpensive automatic evaluation quick , language-independent , correlate highly human evaluation . We propose evaluation method paper . How one measure translation performance ? The close machine translation professional human translation , good . This central idea behind proposal . To judge quality machine translation , one measure closeness one reference human translation accord numerical metric . Thus , MT evaluation system require two ingredients : We fashion closeness metric highly successful word error rate metric use speech recognition community , appropriately modify multiple reference translation allow legitimate difference word choice word order . The main idea use weighted average variable length phrase match reference translations . This view give rise family metric use various weight schemes . We select promising baseline metric family . In Section 2 , describe baseline metric detail . In Section 3 , evaluate performance BLEU . In Section 4 , describe human evaluation experiment . In Section 5 , compare baseline metric performance human evaluations . Typically , many “ perfect ” translation give source sentence . These translation may vary word choice word order even use words . And yet human clearly distinguish good translation bad one . For example , consider two candidate translation forever hear activity guidebook party direct . Although appear subject , differ markedly quality . For comparison , provide three reference human translation sentence . guarantee military force always command Party . Reference 3 : It practical guide army always heed direction party . It clear good translation , Candidate 1 , share many word phrase three reference translations , Candidate 2 . We shortly quantify notion share Section 2 . 1 . But first observe Candidate 1 share & quot ; It guide action & quot ; Reference 1 , & quot ; & quot ; Reference 2 , & quot ; ensures military & quot ; Reference 1 , & quot ; always & quot ; References 2 3 , & quot ; commands & quot ; Reference 1 , finally & quot ; party & quot ; Reference 2 ( ignore capitalization ) . In contrast , Candidate 2 exhibit far matches , extent less . It clear program rank Candidate 1 high Candidate 2 simply compare ngram match candidate translation reference translations . Experiments large collection translation present Section 5 show ranking ability general phenomenon , artifact toy examples . The primary programming task BLEU implementor compare n-grams candidate n-grams reference translation count number matches . These match positionindependent . The matches , good candidate translation . For simplicity , first focus compute unigram matches . The cornerstone metric familiar precision measure . 
To compute precision , one simply count number candidate translation word ( unigrams ) occur reference translation divide total number word candidate translation . Unfortunately , MT system overgenerate “ reasonable ” words , result improbable , high-precision , translation like example 2 . Intuitively problem clear : reference word consider exhaust matching candidate word identified . We formalize intuition modified unigram precision . To compute , one first count maximum number time word occur single reference translation . Next , one clip total count candidate word maximum reference count , 2adds clip count , divide total ( unclipped ) number candidate words . In Example 1 , Candidate 1 achieve modified unigram precision 17/18 ; whereas Candidate 2 achieve modified unigram precision 8/14 . Similarly , modified unigram precision Example 2 2/7 , even though standard unigram precision 7/7 . Modified n-gram precision compute similarly n : candidate n-gram count corresponding maximum reference count collected . The candidate count clip corresponding reference maximum value , summed , divide total number candidate ngrams . In Example 1 , Candidate 1 achieve modified bigram precision 10/17 , whereas low quality Candidate 2 achieve modified bigram precision 1/13 . In Example 2 , ( implausible ) candidate achieve modified bigram precision 0 . This sort modified n-gram precision score capture two aspect translation : adequacy fluency . A translation use word ( 1-grams ) reference tend satisfy adequacy . The longer n-gram match account fluency . 4 2 . 1 . 1 Modified n-gram precision block text How compute modify n-gram precision multi-sentence test set ? Although one typically evaluate MT system corpus entire documents , basic unit evaluation sentence . A source sentence may translate many target sentences , case abuse terminology refer corresponding target sentence “ sentence . ” We first compute n-gram match sentence sentence . Next , add clipped n-gram count candidate sentence divide number candidate n-grams test corpus compute modified precision score , pn , entire test corpus . 4BLEU need match human judgment average test corpus ; score individual sentence often vary human judgments . For example , system produce fluent phrase “ East Asian economy ” penalize heavily longer n-gram precision reference happen read “ economy East Asia . ” The key BLEU ’ success system treat similarly multiple human translator different style used , effect cancel comparison systems . 2 . 1 . 2 Ranking system use modify n-gram precision To verify modify n-gram precision distinguishes good translation bad translations , compute modified precision number output ( good ) human translator standard ( poor ) machine translation system use 4 reference translation 127 source sentences . The average precision result show Figure 1 . The strong signal differentiate human ( high precision ) machine ( low precision ) striking . The difference becomes strong go unigram precision 4-gram precision . It appear single n-gram precision score distinguish good translation bad translation . To useful , however , metric must also reliably distinguish translation differ greatly quality . Furthermore , must distinguish two human translation differ quality . This latter requirement ensure continued validity metric MT approach human translation quality . To end , obtain human translation someone lack native proficiency source ( Chinese ) target language ( English ) . 
For comparison , acquire human translation document native English speaker . We also obtain machine translation three commercial systems . These five “ systems ” — two human three machine — score two reference professional human translations . The average modified n-gram precision result show Figure 2 . Each n-gram statistic imply Phrase ( n -gram ) Length ranking : H2 ( Human-2 ) good H1 ( Human1 ) , big drop quality H1 S3 ( Machine/System-3 ) . S3 appear good S2 turn appear good S1 . Remarkably , rank order assign “ systems ” human judges , discuss later . While seem ample signal single n-gram precision , robust combine signal single number metric . 2 . 1 . 3 Combining modify n-gram precision How combine modified precision various n-gram sizes ? A weight linear average modified precision result encouraging result 5 systems . However , see Figure 2 , modify n-gram precision decay roughly exponentially n : modified unigram precision much large modified bigram precision turn much big modified trigram precision . A reasonable averaging scheme must take exponential decay account ; weighted average logarithm modified precision satisifies requirement . BLEU use average logarithm uniform weights , equivalent use geometric mean modified n-gram precisions . 5 , 6 Experimentally , obtain best correlation monolingual human judgment use maximum n-gram order 4 , although 3-grams 5-grams give comparable results . A candidate translation neither long short , evaluation metric enforce . To extent , n-gram precision already accomplish . N-gram precision penalize spurious word candidate appear reference translations . Additionally , modify precision penalize word occur frequently candidate translation maximum reference count . This reward use word many time warranted penalize use word time occur references . However , modify n-gram precision alone fail enforce proper translation length , illustrate short , absurd example . Because candidate short compare proper length , one expect find inflated precisions : modified unigram precision 2/2 , modified bigram precision 1/1 . Traditionally , precision pair recall overcome length-related problems . However , BLEU consider multiple reference translations , may use different word choice translate source word . Furthermore , good candidate translation use ( recall ) one possible choices , . Indeed , recall choice lead bad translation . Here example . The first candidate recall word references , obviously poor translation second candidate . Thus , naive recall compute set reference word good measure . Admittedly , one could align reference translation discover synonymous word compute recall concept rather words . But , give reference translation vary length differ word order syntax , computation complicated . Candidate translation longer reference already penalize modified n-gram precision measure : need penalize . Consequently , introduce multiplicative brevity penalty factor . With brevity penalty place , high-scoring candidate translation must match reference translation length , word choice , word order . Note neither brevity penalty modified n-gram precision length effect directly consider source length ; instead , consider range reference translation length target language . We wish make brevity penalty 1 . 0 candidate ’ length reference translation ’ length . For example , three reference lengths 12 , 15 , 17 word candidate translation terse 12 words , want brevity penalty 1 . We call close reference sentence length “ best match length . 
” One consideration remains : compute brevity penalty sentence sentence average penalties , length deviation short sentence would punish harshly . Instead , compute brevity penalty entire corpus allow freedom sentence level . We first compute test corpus ’ effective reference length , r , sum best match length candidate sentence corpus . We choose brevity penalty decay exponential r/c , c total length candidate translation corpus . We take geometric mean test corpus ’ modify precision score multiply result exponential brevity penalty factor . Currently , case folding text normalization perform compute precision . We first compute geometric average modified n-gram precisions , pn , use n-grams length N positive weight wn sum one . Next , let c length candidate translation r effective reference corpus length . We compute brevity penalty BP , The ranking behavior immediately apparent log domain , log BLEU = min ( 1 − In baseline , use N = 4 uniform weight wn = 1/N . The BLEU metric range 0 1 . Few translation attain score 1 unless identical reference translation . For reason , even human translator necessarily score 1 . It important note reference translation per sentence , high score . Thus , one must cautious make even “ rough ” comparison evaluation different number reference translations : test corpus 500 sentence ( 40 general news stories ) , human translator score 0 . 3468 four reference score 0 . 2571 two references . Table 1 show BLEU score 5 system two reference test corpus . The MT system S2 S3 close metric . Hence , several question arise : To answer questions , divide test corpus 20 block 25 sentence , compute BLEU metric block individually . We thus 20 sample BLEU metric system . We compute means , variances , pair t-statistics display Table 2 . The t-statistic compare system left neighbor table . For example , = 6 pair S1 S2 . Note number Table 1 BLEU metric aggregate 500 sentences , mean Table 2 average BLEU metric aggregate 25 sentences . As expected , two set result close system differ small finite block size effects . Since paired t-statistic 1 . 7 95 % significant , difference systems ’ score statistically significant . The report variance 25-sentence block serve upper bound variance sizeable test set like 500 sentence corpus . How many reference translation need ? We simulate single-reference test corpus randomly select one 4 reference translation single reference 40 stories . In way , ensure degree stylistic variation . The system maintain rank order multiple references . This outcome suggest may use big test corpus single reference translation , provide translation translator . We two group human judges . The first group , call monolingual group , consist 10 native speaker English . The second group , call bilingual group , consist 10 native speaker Chinese live United States past several years . None human judge professional translator . The human judge 5 standard system Chinese sentence subset extract random 500 sentence test corpus . We pair source sentence 5 translations , total 250 pair Chinese source English translations . We prepare web page translation pair randomly order disperse five translation source sentence . All judge use webpage saw sentence pair order . They rat translation 1 ( bad ) 5 ( good ) . The monolingual group make judgment base translations ’ readability fluency . As must expected , judge liberal others . And sentence easy translate others . To account intrinsic difference judge sentences , compare judge ’ rating sentence across systems . 
We perform four pairwise t-test comparison adjacent system order aggregate average score . Figure 3 show mean difference score two consecutive system 95 % confidence interval mean . We see S2 quite bit well S1 ( mean opinion score difference 0 . 326 5-point scale ) , S3 judge little good ( 0 . 114 ) . Both difference significant 95 % level . 7 The human H1 much good best system , though bit bad human H2 . This surprising give H1 native speaker either Chinese English , whereas H2 native English speaker . Again , difference human translator significant beyond 95 % level . 5 BLEU vs The Human Evaluation Figure 5 show linear regression monolingual group score function BLEU score two reference translation 5 systems . The high correlation coefficient 0 . 99 indicates BLEU track human judgment well . Particularly interesting well BLEU distinguishes S2 S3 quite close . Figure 6 show comparable regression result bilingual group . The correlation coefficient 0 . 96 . We take bad system reference point compare BLEU score human judgment score remain system relative bad system . We take BLEU , monolingual group , bilingual group score 5 system linearly normalize corresponding range ( maximum minimum score across 5 systems ) . The normalized score show Figure 7 . This figure illustrate high correlation BLEU score monolingual group . Of particular interest accuracy BLEU ’ estimate small difference S2 S3 large difference S3 H1 . The figure also highlight relatively large gap MT system human translators . 8 In addition , surmise bilingual group forgive judge H1 relative H2 monolingual group find rather large difference fluency translations . We believe BLEU accelerate MT R & D cycle allow researcher rapidly home effective modeling ideas . Our belief reinforce recent statistical analysis BLEU ’ correlation human judgment translation English four quite different language ( Arabic , Chinese , French , Spanish ) represent 3 different language family ( Papineni et al. , 2002 ) ! BLEU ’ strength correlate highly human judg $ Crossing chasm Chinese-English translation appear significant challenge current state-of-the-art systems . ments average individual sentence judgment error test corpus rather attempt divine exact human judgment every sentence : quantity lead quality . Finally , since MT summarization view natural language generation textual context , believe BLEU could adapt evaluate summarization similar NLG tasks . Acknowledgments This work partially support Defense Advanced Research Projects Agency monitor SPAWAR contract No . N66001-99-2-8916 . The view finding contain material author necessarily reflect position policy Government official endorsement inferred . We gratefully acknowledge comment geometric mean John Makhoul BBN discussion George Doddington NIST . We especially wish thank colleague serve monolingual bilingual judge pool perseverance judge output ChineseEnglish MT systems .
requirements.txt
ADDED
@@ -0,0 +1,9 @@
clean-text[gpl]
gradio
nltk
torch
tqdm
transformers
accelerate
sentence_transformers
networkx<2.7
summarize.py
ADDED
@@ -0,0 +1,144 @@
import logging

import torch
from tqdm.auto import tqdm
from transformers import BartForConditionalGeneration, BartTokenizer


def load_model_and_tokenizer(model_name):
    """
    load_model_and_tokenizer - a function that loads a model and tokenizer from huggingface
    Args:
        model_name (str): the name of the model to load
    Returns:
        BartForConditionalGeneration: the model
        BartTokenizer: the tokenizer
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = BartForConditionalGeneration.from_pretrained(
        model_name,
        # low_cpu_mem_usage=True,
        # use_cache=False,
    ).to(device)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    logging.info(f"Loaded model {model_name} to {device}")
    return model, tokenizer


def summarize_and_score(
    ids, mask, model, tokenizer, is_general_attention_model=True, **kwargs
):
    """
    summarize_and_score - given a batch of ids and a mask, return a summary and a score for the summary
    Args:
        ids (): the batch of ids
        mask (): the attention mask for the batch
        model (): the model to use for summarization
        tokenizer (): the tokenizer to use for summarization
        is_general_attention_model (bool, optional): whether the model is a general attention model. Defaults to True.
    Returns:
        tuple: (summary, score) for the batch, where summary is the list of decoded strings
    """

    ids = ids[None, :]
    mask = mask[None, :]

    input_ids = ids.to("cuda") if torch.cuda.is_available() else ids
    attention_mask = mask.to("cuda") if torch.cuda.is_available() else mask

    global_attention_mask = torch.zeros_like(attention_mask)
    # put global attention on <s> token
    global_attention_mask[:, 0] = 1

    if is_general_attention_model:
        summary_pred_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            output_scores=True,
            return_dict_in_generate=True,
            **kwargs,
        )
    else:
        summary_pred_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            output_scores=True,
            return_dict_in_generate=True,
            **kwargs,
        )
    summary = tokenizer.batch_decode(
        summary_pred_ids.sequences,
        skip_special_tokens=True,
        remove_invalid_values=True,
    )
    score = round(summary_pred_ids.sequences_scores.cpu().numpy()[0], 4)

    return summary, score


def summarize_via_tokenbatches(
    input_text: str,
    model,
    tokenizer,
    batch_length=2048,
    batch_stride=16,
    **kwargs,
):
    """
    summarize_via_tokenbatches - a function that takes a string and returns a summary
    Args:
        input_text (str): the text to summarize
        model (): the model to use for summarization
        tokenizer (): the tokenizer to use for summarization
        batch_length (int, optional): the length of each batch. Defaults to 2048.
        batch_stride (int, optional): the stride of each batch. Defaults to 16. The stride is the number of tokens that overlap between batches.
    Returns:
        list of dict: one entry per token batch, with the input tokens, summary text, and summary score
    """
    # log all input parameters
    if batch_length < 512:
        batch_length = 512
        print("WARNING: batch_length was set to 512")
    print(
        f"input parameters: {kwargs}, batch_length={batch_length}, batch_stride={batch_stride}"
    )
    encoded_input = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=batch_length,
        stride=batch_stride,
        return_overflowing_tokens=True,
        add_special_tokens=False,
        return_tensors="pt",
    )

    in_id_arr, att_arr = encoded_input.input_ids, encoded_input.attention_mask
    gen_summaries = []

    pbar = tqdm(total=len(in_id_arr))

    for _id, _mask in zip(in_id_arr, att_arr):

        result, score = summarize_and_score(
            ids=_id,
            mask=_mask,
            model=model,
            tokenizer=tokenizer,
            **kwargs,
        )
        score = round(float(score), 4)
        _sum = {
            "input_tokens": _id,
            "summary": result,
            "summary_score": score,
        }
        gen_summaries.append(_sum)
        print(f"\t{result[0]}\nScore:\t{score}")
        pbar.update()

    pbar.close()

    return gen_summaries
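Each element of the list returned by summarize_via_tokenbatches holds the window's input tokens, the decoded summary, and the beam-search sequence score. A hedged sketch of how a caller consumes that output, mirroring what app.py does (assumes model, tokenizer, and long_text are already defined):

summaries = summarize_via_tokenbatches(
    long_text, model, tokenizer, batch_length=1024, num_beams=2
)
for i, s in enumerate(summaries):
    print(f"Section {i}: {s['summary'][0]}")
    print(f"  score: {s['summary_score']}")  # beam-search sequence score; closer to 0 is better, per the app's note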
textrank.py
ADDED
@@ -0,0 +1,41 @@
import numpy as np
import pandas as pd
import nltk

nltk.download('punkt')  # one-time execution
import re
import warnings

warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import torch
from tqdm import tqdm

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

if torch.cuda.is_available():
    model.to('cuda')  # only move the sentence encoder to GPU when one is present


def get_summary(text, num_words: int = 1000):
    sentences = nltk.sent_tokenize(text)
    embeddings = model.encode(sentences, show_progress_bar=False)
    try:
        sim_matrix = cosine_similarity(embeddings)
    except Exception as e:
        print(e, type(e))
        print(embeddings.shape)
        raise  # the graph below cannot be built without a similarity matrix
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True
    )
    final_sents = []
    total_length = 0
    for score, sents, i in ranked_sentences:
        total_length += len(sents.split())
        if total_length < num_words:
            final_sents.append((score, sents, i))
        else:
            break

    top_k_sents = sorted(final_sents, key=lambda x: x[2])
    sents = " ".join([s[1] for s in top_k_sents])

    return sents
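get_summary is the extractive pre-filter used by app.py: it embeds each sentence, builds a cosine-similarity graph, ranks sentences with PageRank, and keeps the top-ranked sentences (restored to document order) until the word budget is reached. A small, hypothetical usage check against the bundled example file:

# hypothetical quick check of the TextRank pre-filter
from textrank import get_summary

document = open("example.txt", encoding="utf-8").read()
shortened = get_summary(document, num_words=300)  # keep roughly the 300 highest-ranked words
print(len(document.split()), "->", len(shortened.split()), "words")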
utils.py
ADDED
@@ -0,0 +1,121 @@
"""
utils.py - Utility functions for the project.
"""

import re
from pathlib import Path
from datetime import datetime
import subprocess

try:
    from natsort import natsorted
except ImportError:
    # natsort is not listed in requirements.txt; fall back to plain sorted()
    natsorted = sorted


def get_timestamp() -> str:
    """
    get_timestamp - get a timestamp for the current time
    Returns:
        str, the timestamp
    """
    return datetime.now().strftime("%Y%m%d_%H%M%S")


def truncate_word_count(text, max_words=512):
    """
    truncate_word_count - a helper function for the gradio module
    Parameters
    ----------
    text : str, required, the text to be processed
    max_words : int, optional, the maximum number of words, default=512
    Returns
    -------
    dict, the text and whether it was truncated
    """
    # split on whitespace with regex
    words = re.split(r"\s+", text)
    processed = {}
    if len(words) > max_words:
        processed["was_truncated"] = True
        processed["truncated_text"] = " ".join(words[:max_words])
    else:
        processed["was_truncated"] = False
        processed["truncated_text"] = text
    return processed


def load_examples(src, filetypes=[".txt", ".pdf"]):
    """
    load_examples - a helper function for the gradio module to load examples
    Returns:
        list of str, the examples
    """
    src = Path(src)
    src.mkdir(exist_ok=True)

    pdf_url = (
        "https://www.dropbox.com/s/y92xy7o5qb88yij/all_you_need_is_attention.pdf?dl=1"
    )
    subprocess.run(["wget", pdf_url, "-O", src / "all_you_need_is_attention.pdf"])
    examples = [f for f in src.iterdir() if f.suffix in filetypes]
    examples = natsorted(examples)
    # load the examples into a list
    text_examples = []
    for example in examples:
        with open(example, "r") as f:
            text = f.read()
            text_examples.append([text, "base", 2, 1024, 0.7, 3.5, 3])

    return text_examples


def load_example_filenames(example_path: str or Path):
    """
    load_example_filenames - a helper function for the gradio module to load examples
    Returns:
        dict, the examples (filename:full path)
    """
    example_path = Path(example_path)
    # load the examples into a list
    examples = {f.name: f for f in example_path.glob("*.txt")}
    return examples


def saves_summary(summarize_output, outpath: str or Path = None, add_signature=True):
    """
    saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file

    Example:

        _summaries = summarize_via_tokenbatches(
            text,
            batch_length=token_batch_length,
            batch_stride=batch_stride,
            **settings,
        )
    """

    outpath = (
        Path.cwd() / f"document_summary_{get_timestamp()}.txt"
        if outpath is None
        else Path(outpath)
    )
    sum_text = [s["summary"][0] for s in summarize_output]
    sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
    scores_text = "\n".join(sum_scores)
    full_summary = "\n\t".join(sum_text)

    with open(
        outpath,
        "w",
    ) as fo:
        if add_signature:
            fo.write(
                "Generated with the Document Summarization space :) https://hf.co/spaces/pszemraj/document-summarization\n\n"
            )
        fo.writelines(full_summary)
    with open(
        outpath,
        "a",
    ) as fo:

        fo.write("\n" * 3)
        fo.write("\n\nSection Scores:\n")
        fo.writelines(scores_text)
        fo.write("\n\n---\n")

    return outpath
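A short sketch of the two helpers app.py relies on at request time, using made-up data so it runs standalone; the dict shape passed to saves_summary matches what summarize_via_tokenbatches returns:

from utils import truncate_word_count, saves_summary

chunk = truncate_word_count("lorem " * 2000, max_words=512)
print(chunk["was_truncated"], len(chunk["truncated_text"].split()))  # True 512

# fabricated stand-in for summarize_via_tokenbatches() output, for illustration only
fake_output = [{"summary": ["A one-sentence summary."], "summary_score": -0.4213}]
print(saves_summary(fake_output))  # path like document_summary_<timestamp>.txt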