Spaces:
Runtime error
Upload 6 files
Browse files
- app.py +280 -0
- example.txt +1 -0
- requirements.txt +9 -0
- summarize.py +144 -0
- textrank.py +41 -0
- utils.py +121 -0
app.py
ADDED
@@ -0,0 +1,280 @@
import os
import contextlib
import logging
import random
import re
import time
from pathlib import Path

import gradio as gr
import nltk
from cleantext import clean

from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from utils import load_example_filenames, truncate_word_count, saves_summary
from textrank import get_summary

example_path = "/content/drive/MyDrive/space/"
nltk.download("stopwords")  # TODO: find where this requirement originates from

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def proc_submission(
    input_text: str,
    model_size: str,
    num_beams,
    token_batch_length,
    length_penalty,
    repetition_penalty,
    no_repeat_ngram_size,
    max_input_length: int = 1024,
):

    settings = {
        "length_penalty": float(length_penalty),
        "repetition_penalty": float(repetition_penalty),
        "no_repeat_ngram_size": int(no_repeat_ngram_size),
        "encoder_no_repeat_ngram_size": 4,
        "num_beams": int(num_beams),
        "min_length": 4,
        "max_length": int(token_batch_length // 4),
        "early_stopping": True,
        "do_sample": False,
    }
    st = time.perf_counter()
    history = {}
    clean_text = clean(input_text, lower=False)
    max_input_length = 1024 if "base" in model_size.lower() else max_input_length
    clean_text = get_summary(clean_text)
    processed = truncate_word_count(clean_text, max_input_length)

    if processed["was_truncated"]:
        tr_in = processed["truncated_text"]
        # create elaborate HTML warning
        input_wc = re.split(r"\s+", input_text)
        msg = f"""
        <div style="background-color: #FFA500; color: white; padding: 20px;">
        <h3>Warning</h3>
        <p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/len(input_wc):.2f}% of the submission.</p>
        </div>
        """
        logging.warning(msg)
        history["WARNING"] = msg
    else:
        tr_in = input_text
        msg = None

    if len(input_text) < 50:
        # this is essentially a different case from the above
        msg = f"""
        <div style="background-color: #880808; color: white; padding: 20px;">
        <h3>Warning</h3>
        <p>Input text is too short to summarize. Detected {len(input_text)} characters.
        Please load text by selecting an example from the dropdown menu or by pasting text into the text box.</p>
        </div>
        """
        logging.warning(msg)
        logging.warning("RETURNING EMPTY STRING")
        history["WARNING"] = msg

        return msg, "", "", None

    _summaries = summarize_via_tokenbatches(
        tr_in,
        model,
        tokenizer,
        batch_length=token_batch_length,
        **settings,
    )
    sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
    sum_scores = [
        f" - Section {i}: {round(s['summary_score'],4)}"
        for i, s in enumerate(_summaries)
    ]

    sum_text_out = "\n".join(sum_text)
    history["Summary Scores"] = "<br><br>"
    scores_out = "\n".join(sum_scores)
    rt = round((time.perf_counter() - st) / 60, 2)
    print(f"Runtime: {rt} minutes")
    html = ""
    html += f"<p>Runtime: {rt} minutes on CPU</p>"
    if msg is not None:
        html += msg

    html += ""

    # save to file
    saved_file = saves_summary(_summaries)

    return html, sum_text_out, scores_out, saved_file


def load_single_example_text(
    example_path: str or Path = "/content/example.txt",
    max_pages=20,
):
    """
    load_single_example_text - a helper function for the gradio module to load a single example file
    Returns:
        str, the example text
    """
    global name_to_path
    full_ex_path = name_to_path[example_path]
    full_ex_path = Path(full_ex_path)
    if full_ex_path.suffix == ".txt":
        with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
            raw_text = f.read()
            text = clean(raw_text, lower=False)
    else:
        logging.error(f"Unknown file type {full_ex_path.suffix}")
        text = "ERROR - check example path"

    return text


if __name__ == "__main__":
    logging.info("Starting app instance")
    os.environ[
        "TOKENIZERS_PARALLELISM"
    ] = "false"  # parallelism on tokenizers is buggy with gradio
    logging.info("Loading summarization model")
    with contextlib.redirect_stdout(None):
        model, tokenizer = load_model_and_tokenizer(
            "SmartPy/bart-large-cnn-finetuned-scientific_summarize"
        )

    name_to_path = load_example_filenames(example_path)
    logging.info(f"Loaded {len(name_to_path)} examples")
    demo = gr.Blocks()
    _examples = list(name_to_path.keys())
    with demo:

        gr.Markdown("# Document Summarization with Long-Document Transformers")
        gr.Markdown(
            "This is an example use case for fine-tuned long-document transformers. The model is trained on scientific article summaries (via the Yale Scientific Article Summarization Dataset). The model in this demo is [bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn)."
        )
        with gr.Column():

            gr.Markdown("## Load Inputs & Select Parameters")
            gr.Markdown(
                "Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate)."
            )
            with gr.Row(variant="compact"):
                with gr.Column(scale=0.5, variant="compact"):

                    model_size = gr.Radio(
                        choices=["bart-large-cnn"],
                        label="Model Variant",
                        value="bart-large-cnn",
                    )
                    num_beams = gr.Radio(
                        choices=[2, 3, 4],
                        label="Beam Search: # of Beams",
                        value=2,
                    )
                with gr.Column(variant="compact"):
                    example_name = gr.Dropdown(
                        _examples,
                        label="Examples",
                        value=random.choice(_examples),
                    )

            with gr.Row():
                input_text = gr.Textbox(
                    lines=4,
                    label="Input Text (for summarization)",
                    placeholder="Enter text to summarize; it will be cleaned and truncated on Spaces. Narrative, academic (papers and lecture transcriptions), and article text all work well. Generation may take a while depending on the input text :)",
                )
                with gr.Column(min_width=100, scale=0.5):
                    load_examples_button = gr.Button(
                        "Load Example",
                    )

        with gr.Column():
            gr.Markdown("## Generate Summary")
            gr.Markdown(
                "Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios."
            )
            summarize_button = gr.Button(
                "Summarize!",
                variant="primary",
            )

            output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
            gr.Markdown("### Summary Output")
            summary_text = gr.Textbox(
                label="Summary", placeholder="The generated summary will appear here"
            )
            gr.Markdown(
                "The summary scores can be thought of as representing the quality of the summary. Less-negative numbers (closer to 0) are better:"
            )
            summary_scores = gr.Textbox(
                label="Summary Scores", placeholder="Summary scores will appear here"
            )

            text_file = gr.File(
                label="Download Summary as Text File",
                file_count="single",
                type="file",
                interactive=False,
            )

        gr.Markdown("---")
        with gr.Column():
            gr.Markdown("### Advanced Settings")
            with gr.Row(variant="compact"):
                length_penalty = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    label="length penalty",
                    value=0.7,
                    step=0.05,
                )
                token_batch_length = gr.Radio(
                    choices=[512, 768, 1024, 1536],
                    label="token batch length",
                    value=1024,
                )

            with gr.Row(variant="compact"):
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=5.0,
                    label="repetition penalty",
                    value=3.5,
                    step=0.1,
                )
                no_repeat_ngram_size = gr.Radio(
                    choices=[2, 3, 4],
                    label="no repeat ngram size",
                    value=3,
                )
        with gr.Column():
            gr.Markdown("### About the Model")
            gr.Markdown(
                "These models are fine-tuned on the [1000 most cited papers in the ACL Anthology Network (AAN)](http://arxiv.org/pdf/1909.01716.pdf). The goal was to create a model that generalizes well and is useful for summarizing large amounts of text in academic and daily usage."
            )
            gr.Markdown("---")

        load_examples_button.click(
            fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
        )

        summarize_button.click(
            fn=proc_submission,
            inputs=[
                input_text,
                model_size,
                num_beams,
                token_batch_length,
                length_penalty,
                repetition_penalty,
                no_repeat_ngram_size,
            ],
            outputs=[output_text, summary_text, summary_scores, text_file],
        )

    demo.launch(enable_queue=True, debug=True)
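For reference, the pipeline that proc_submission wires into the Gradio UI (clean, TextRank pre-filter, truncate, abstractive summarization, save) can also be exercised headlessly. The sketch below is illustrative only: it assumes the files in this commit are importable as modules, reuses the default generation settings from the UI, and reads the bundled example.txt.

# illustrative, headless version of the app's pipeline (a sketch, not part of the Space)
from cleantext import clean

from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from textrank import get_summary
from utils import truncate_word_count, saves_summary

model, tokenizer = load_model_and_tokenizer(
    "SmartPy/bart-large-cnn-finetuned-scientific_summarize"
)

raw_text = open("example.txt", encoding="utf-8").read()
text = clean(raw_text, lower=False)                   # normalize the raw text
text = get_summary(text)                              # extractive TextRank pre-filter (~1000 words)
text = truncate_word_count(text, max_words=1024)["truncated_text"]

summaries = summarize_via_tokenbatches(
    text,
    model,
    tokenizer,
    batch_length=1024,
    num_beams=2,
    length_penalty=0.7,
    repetition_penalty=3.5,
    no_repeat_ngram_size=3,
    encoder_no_repeat_ngram_size=4,
    min_length=4,
    max_length=256,                                   # 1024 // 4, as in proc_submission
    early_stopping=True,
    do_sample=False,
)
print(saves_summary(summaries))                       # path of the saved summary file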
example.txt
ADDED
@@ -0,0 +1 @@
Human evaluation machine translation ( MT ) weigh many aspect translation , include adequacy , fidelity , fluency translation ( Hovy , 1999 ; White O ’ Connell , 1994 ) . A comprehensive catalog MT evaluation technique rich literature give Reeder ( 2001 ) . For part , various human evaluation approach quite expensive ( Hovy , 1999 ) . Moreover , take week month finish . This big problem developer machine translation system need monitor effect daily change system order weed bad idea good ideas . We believe MT progress stem evaluation logjam fruitful research idea wait release 1So call method bilingual evaluation understudy , BLEU . evaluation bottleneck . Developers would benefit inexpensive automatic evaluation quick , language-independent , correlate highly human evaluation . We propose evaluation method paper . How one measure translation performance ? The close machine translation professional human translation , good . This central idea behind proposal . To judge quality machine translation , one measure closeness one reference human translation accord numerical metric . Thus , MT evaluation system require two ingredients : We fashion closeness metric highly successful word error rate metric use speech recognition community , appropriately modify multiple reference translation allow legitimate difference word choice word order . The main idea use weighted average variable length phrase match reference translations . This view give rise family metric use various weight schemes . We select promising baseline metric family . In Section 2 , describe baseline metric detail . In Section 3 , evaluate performance BLEU . In Section 4 , describe human evaluation experiment . In Section 5 , compare baseline metric performance human evaluations . Typically , many “ perfect ” translation give source sentence . These translation may vary word choice word order even use words . And yet human clearly distinguish good translation bad one . For example , consider two candidate translation forever hear activity guidebook party direct . Although appear subject , differ markedly quality . For comparison , provide three reference human translation sentence . guarantee military force always command Party . Reference 3 : It practical guide army always heed direction party . It clear good translation , Candidate 1 , share many word phrase three reference translations , Candidate 2 . We shortly quantify notion share Section 2 . 1 . But first observe Candidate 1 share & quot ; It guide action & quot ; Reference 1 , & quot ; & quot ; Reference 2 , & quot ; ensures military & quot ; Reference 1 , & quot ; always & quot ; References 2 3 , & quot ; commands & quot ; Reference 1 , finally & quot ; party & quot ; Reference 2 ( ignore capitalization ) . In contrast , Candidate 2 exhibit far matches , extent less . It clear program rank Candidate 1 high Candidate 2 simply compare ngram match candidate translation reference translations . Experiments large collection translation present Section 5 show ranking ability general phenomenon , artifact toy examples . The primary programming task BLEU implementor compare n-grams candidate n-grams reference translation count number matches . These match positionindependent . The matches , good candidate translation . For simplicity , first focus compute unigram matches . The cornerstone metric familiar precision measure . 
To compute precision , one simply count number candidate translation word ( unigrams ) occur reference translation divide total number word candidate translation . Unfortunately , MT system overgenerate “ reasonable ” words , result improbable , high-precision , translation like example 2 . Intuitively problem clear : reference word consider exhaust matching candidate word identified . We formalize intuition modified unigram precision . To compute , one first count maximum number time word occur single reference translation . Next , one clip total count candidate word maximum reference count , 2adds clip count , divide total ( unclipped ) number candidate words . In Example 1 , Candidate 1 achieve modified unigram precision 17/18 ; whereas Candidate 2 achieve modified unigram precision 8/14 . Similarly , modified unigram precision Example 2 2/7 , even though standard unigram precision 7/7 . Modified n-gram precision compute similarly n : candidate n-gram count corresponding maximum reference count collected . The candidate count clip corresponding reference maximum value , summed , divide total number candidate ngrams . In Example 1 , Candidate 1 achieve modified bigram precision 10/17 , whereas low quality Candidate 2 achieve modified bigram precision 1/13 . In Example 2 , ( implausible ) candidate achieve modified bigram precision 0 . This sort modified n-gram precision score capture two aspect translation : adequacy fluency . A translation use word ( 1-grams ) reference tend satisfy adequacy . The longer n-gram match account fluency . 4 2 . 1 . 1 Modified n-gram precision block text How compute modify n-gram precision multi-sentence test set ? Although one typically evaluate MT system corpus entire documents , basic unit evaluation sentence . A source sentence may translate many target sentences , case abuse terminology refer corresponding target sentence “ sentence . ” We first compute n-gram match sentence sentence . Next , add clipped n-gram count candidate sentence divide number candidate n-grams test corpus compute modified precision score , pn , entire test corpus . 4BLEU need match human judgment average test corpus ; score individual sentence often vary human judgments . For example , system produce fluent phrase “ East Asian economy ” penalize heavily longer n-gram precision reference happen read “ economy East Asia . ” The key BLEU ’ success system treat similarly multiple human translator different style used , effect cancel comparison systems . 2 . 1 . 2 Ranking system use modify n-gram precision To verify modify n-gram precision distinguishes good translation bad translations , compute modified precision number output ( good ) human translator standard ( poor ) machine translation system use 4 reference translation 127 source sentences . The average precision result show Figure 1 . The strong signal differentiate human ( high precision ) machine ( low precision ) striking . The difference becomes strong go unigram precision 4-gram precision . It appear single n-gram precision score distinguish good translation bad translation . To useful , however , metric must also reliably distinguish translation differ greatly quality . Furthermore , must distinguish two human translation differ quality . This latter requirement ensure continued validity metric MT approach human translation quality . To end , obtain human translation someone lack native proficiency source ( Chinese ) target language ( English ) . 
For comparison , acquire human translation document native English speaker . We also obtain machine translation three commercial systems . These five “ systems ” — two human three machine — score two reference professional human translations . The average modified n-gram precision result show Figure 2 . Each n-gram statistic imply Phrase ( n -gram ) Length ranking : H2 ( Human-2 ) good H1 ( Human1 ) , big drop quality H1 S3 ( Machine/System-3 ) . S3 appear good S2 turn appear good S1 . Remarkably , rank order assign “ systems ” human judges , discuss later . While seem ample signal single n-gram precision , robust combine signal single number metric . 2 . 1 . 3 Combining modify n-gram precision How combine modified precision various n-gram sizes ? A weight linear average modified precision result encouraging result 5 systems . However , see Figure 2 , modify n-gram precision decay roughly exponentially n : modified unigram precision much large modified bigram precision turn much big modified trigram precision . A reasonable averaging scheme must take exponential decay account ; weighted average logarithm modified precision satisifies requirement . BLEU use average logarithm uniform weights , equivalent use geometric mean modified n-gram precisions . 5 , 6 Experimentally , obtain best correlation monolingual human judgment use maximum n-gram order 4 , although 3-grams 5-grams give comparable results . A candidate translation neither long short , evaluation metric enforce . To extent , n-gram precision already accomplish . N-gram precision penalize spurious word candidate appear reference translations . Additionally , modify precision penalize word occur frequently candidate translation maximum reference count . This reward use word many time warranted penalize use word time occur references . However , modify n-gram precision alone fail enforce proper translation length , illustrate short , absurd example . Because candidate short compare proper length , one expect find inflated precisions : modified unigram precision 2/2 , modified bigram precision 1/1 . Traditionally , precision pair recall overcome length-related problems . However , BLEU consider multiple reference translations , may use different word choice translate source word . Furthermore , good candidate translation use ( recall ) one possible choices , . Indeed , recall choice lead bad translation . Here example . The first candidate recall word references , obviously poor translation second candidate . Thus , naive recall compute set reference word good measure . Admittedly , one could align reference translation discover synonymous word compute recall concept rather words . But , give reference translation vary length differ word order syntax , computation complicated . Candidate translation longer reference already penalize modified n-gram precision measure : need penalize . Consequently , introduce multiplicative brevity penalty factor . With brevity penalty place , high-scoring candidate translation must match reference translation length , word choice , word order . Note neither brevity penalty modified n-gram precision length effect directly consider source length ; instead , consider range reference translation length target language . We wish make brevity penalty 1 . 0 candidate ’ length reference translation ’ length . For example , three reference lengths 12 , 15 , 17 word candidate translation terse 12 words , want brevity penalty 1 . We call close reference sentence length “ best match length . 
” One consideration remains : compute brevity penalty sentence sentence average penalties , length deviation short sentence would punish harshly . Instead , compute brevity penalty entire corpus allow freedom sentence level . We first compute test corpus ’ effective reference length , r , sum best match length candidate sentence corpus . We choose brevity penalty decay exponential r/c , c total length candidate translation corpus . We take geometric mean test corpus ’ modify precision score multiply result exponential brevity penalty factor . Currently , case folding text normalization perform compute precision . We first compute geometric average modified n-gram precisions , pn , use n-grams length N positive weight wn sum one . Next , let c length candidate translation r effective reference corpus length . We compute brevity penalty BP , The ranking behavior immediately apparent log domain , log BLEU = min ( 1 − In baseline , use N = 4 uniform weight wn = 1/N . The BLEU metric range 0 1 . Few translation attain score 1 unless identical reference translation . For reason , even human translator necessarily score 1 . It important note reference translation per sentence , high score . Thus , one must cautious make even “ rough ” comparison evaluation different number reference translations : test corpus 500 sentence ( 40 general news stories ) , human translator score 0 . 3468 four reference score 0 . 2571 two references . Table 1 show BLEU score 5 system two reference test corpus . The MT system S2 S3 close metric . Hence , several question arise : To answer questions , divide test corpus 20 block 25 sentence , compute BLEU metric block individually . We thus 20 sample BLEU metric system . We compute means , variances , pair t-statistics display Table 2 . The t-statistic compare system left neighbor table . For example , = 6 pair S1 S2 . Note number Table 1 BLEU metric aggregate 500 sentences , mean Table 2 average BLEU metric aggregate 25 sentences . As expected , two set result close system differ small finite block size effects . Since paired t-statistic 1 . 7 95 % significant , difference systems ’ score statistically significant . The report variance 25-sentence block serve upper bound variance sizeable test set like 500 sentence corpus . How many reference translation need ? We simulate single-reference test corpus randomly select one 4 reference translation single reference 40 stories . In way , ensure degree stylistic variation . The system maintain rank order multiple references . This outcome suggest may use big test corpus single reference translation , provide translation translator . We two group human judges . The first group , call monolingual group , consist 10 native speaker English . The second group , call bilingual group , consist 10 native speaker Chinese live United States past several years . None human judge professional translator . The human judge 5 standard system Chinese sentence subset extract random 500 sentence test corpus . We pair source sentence 5 translations , total 250 pair Chinese source English translations . We prepare web page translation pair randomly order disperse five translation source sentence . All judge use webpage saw sentence pair order . They rat translation 1 ( bad ) 5 ( good ) . The monolingual group make judgment base translations ’ readability fluency . As must expected , judge liberal others . And sentence easy translate others . To account intrinsic difference judge sentences , compare judge ’ rating sentence across systems . 
We perform four pairwise t-test comparison adjacent system order aggregate average score . Figure 3 show mean difference score two consecutive system 95 % confidence interval mean . We see S2 quite bit well S1 ( mean opinion score difference 0 . 326 5-point scale ) , S3 judge little good ( 0 . 114 ) . Both difference significant 95 % level . 7 The human H1 much good best system , though bit bad human H2 . This surprising give H1 native speaker either Chinese English , whereas H2 native English speaker . Again , difference human translator significant beyond 95 % level . 5 BLEU vs The Human Evaluation Figure 5 show linear regression monolingual group score function BLEU score two reference translation 5 systems . The high correlation coefficient 0 . 99 indicates BLEU track human judgment well . Particularly interesting well BLEU distinguishes S2 S3 quite close . Figure 6 show comparable regression result bilingual group . The correlation coefficient 0 . 96 . We take bad system reference point compare BLEU score human judgment score remain system relative bad system . We take BLEU , monolingual group , bilingual group score 5 system linearly normalize corresponding range ( maximum minimum score across 5 systems ) . The normalized score show Figure 7 . This figure illustrate high correlation BLEU score monolingual group . Of particular interest accuracy BLEU ’ estimate small difference S2 S3 large difference S3 H1 . The figure also highlight relatively large gap MT system human translators . 8 In addition , surmise bilingual group forgive judge H1 relative H2 monolingual group find rather large difference fluency translations . We believe BLEU accelerate MT R & D cycle allow researcher rapidly home effective modeling ideas . Our belief reinforce recent statistical analysis BLEU ’ correlation human judgment translation English four quite different language ( Arabic , Chinese , French , Spanish ) represent 3 different language family ( Papineni et al. , 2002 ) ! BLEU ’ strength correlate highly human judg $ Crossing chasm Chinese-English translation appear significant challenge current state-of-the-art systems . ments average individual sentence judgment error test corpus rather attempt divine exact human judgment every sentence : quantity lead quality . Finally , since MT summarization view natural language generation textual context , believe BLEU could adapt evaluate summarization similar NLG tasks . Acknowledgments This work partially support Defense Advanced Research Projects Agency monitor SPAWAR contract No . N66001-99-2-8916 . The view finding contain material author necessarily reflect position policy Government official endorsement inferred . We gratefully acknowledge comment geometric mean John Makhoul BBN discussion George Doddington NIST . We especially wish thank colleague serve monolingual bilingual judge pool perseverance judge output ChineseEnglish MT systems .
requirements.txt
ADDED
@@ -0,0 +1,9 @@
clean-text[gpl]
gradio
nltk
torch
tqdm
transformers
accelerate
sentence_transformers
networkx<2.7
summarize.py
ADDED
@@ -0,0 +1,144 @@
import logging

import torch
from tqdm.auto import tqdm
from transformers import BartForConditionalGeneration, BartTokenizer


def load_model_and_tokenizer(model_name):
    """
    load_model_and_tokenizer - a function that loads a model and tokenizer from huggingface
    Args:
        model_name (str): the name of the model to load
    Returns:
        BartForConditionalGeneration: the model
        BartTokenizer: the tokenizer
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = BartForConditionalGeneration.from_pretrained(
        model_name,
        # low_cpu_mem_usage=True,
        # use_cache=False,
    ).to(device)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    logging.info(f"Loaded model {model_name} to {device}")
    return model, tokenizer


def summarize_and_score(
    ids, mask, model, tokenizer, is_general_attention_model=True, **kwargs
):
    """
    summarize_and_score - given a batch of ids and a mask, return a summary and a score for the summary
    Args:
        ids (): the batch of ids
        mask (): the attention mask for the batch
        model (): the model to use for summarization
        tokenizer (): the tokenizer to use for summarization
        is_general_attention_model (bool, optional): whether the model is a general attention model. Defaults to True.
    Returns:
        tuple: (summary, score) for the batch, where summary is the list of decoded strings
    """

    ids = ids[None, :]
    mask = mask[None, :]

    input_ids = ids.to("cuda") if torch.cuda.is_available() else ids
    attention_mask = mask.to("cuda") if torch.cuda.is_available() else mask

    global_attention_mask = torch.zeros_like(attention_mask)
    # put global attention on <s> token
    global_attention_mask[:, 0] = 1

    if is_general_attention_model:
        summary_pred_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            output_scores=True,
            return_dict_in_generate=True,
            **kwargs,
        )
    else:
        summary_pred_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            output_scores=True,
            return_dict_in_generate=True,
            **kwargs,
        )
    summary = tokenizer.batch_decode(
        summary_pred_ids.sequences,
        skip_special_tokens=True,
        remove_invalid_values=True,
    )
    score = round(summary_pred_ids.sequences_scores.cpu().numpy()[0], 4)

    return summary, score


def summarize_via_tokenbatches(
    input_text: str,
    model,
    tokenizer,
    batch_length=2048,
    batch_stride=16,
    **kwargs,
):
    """
    summarize_via_tokenbatches - a function that takes a string and returns a summary
    Args:
        input_text (str): the text to summarize
        model (): the model to use for summarization
        tokenizer (): the tokenizer to use for summarization
        batch_length (int, optional): the length of each batch. Defaults to 2048.
        batch_stride (int, optional): the stride of each batch. Defaults to 16. The stride is the number of tokens that overlap between batches.
    Returns:
        list of dict: one entry per token batch, with the input tokens, summary text, and summary score
    """
    # log all input parameters
    if batch_length < 512:
        batch_length = 512
        print("WARNING: batch_length was set to 512")
    print(
        f"input parameters: {kwargs}, batch_length={batch_length}, batch_stride={batch_stride}"
    )
    encoded_input = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=batch_length,
        stride=batch_stride,
        return_overflowing_tokens=True,
        add_special_tokens=False,
        return_tensors="pt",
    )

    in_id_arr, att_arr = encoded_input.input_ids, encoded_input.attention_mask
    gen_summaries = []

    pbar = tqdm(total=len(in_id_arr))

    for _id, _mask in zip(in_id_arr, att_arr):

        result, score = summarize_and_score(
            ids=_id,
            mask=_mask,
            model=model,
            tokenizer=tokenizer,
            **kwargs,
        )
        score = round(float(score), 4)
        _sum = {
            "input_tokens": _id,
            "summary": result,
            "summary_score": score,
        }
        gen_summaries.append(_sum)
        print(f"\t{result[0]}\nScore:\t{score}")
        pbar.update()

    pbar.close()

    return gen_summaries
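Each element of the list returned by summarize_via_tokenbatches holds the window's input tokens, the decoded summary, and the beam-search sequence score. A hedged sketch of how a caller consumes that output, mirroring what app.py does (assumes model, tokenizer, and long_text are already defined):

summaries = summarize_via_tokenbatches(
    long_text, model, tokenizer, batch_length=1024, num_beams=2
)
for i, s in enumerate(summaries):
    print(f"Section {i}: {s['summary'][0]}")
    print(f"  score: {s['summary_score']}")  # beam-search sequence score; closer to 0 is better, per the app's note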
textrank.py
ADDED
@@ -0,0 +1,41 @@
import numpy as np
import pandas as pd
import nltk

nltk.download('punkt')  # one-time execution
import re
import warnings

warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import torch
from tqdm import tqdm

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

if torch.cuda.is_available():
    model.to('cuda')  # only move the sentence encoder to GPU when one is present


def get_summary(text, num_words: int = 1000):
    sentences = nltk.sent_tokenize(text)
    embeddings = model.encode(sentences, show_progress_bar=False)
    try:
        sim_matrix = cosine_similarity(embeddings)
    except Exception as e:
        print(e, type(e))
        print(embeddings.shape)
        raise  # the graph below cannot be built without a similarity matrix
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True
    )
    final_sents = []
    total_length = 0
    for score, sents, i in ranked_sentences:
        total_length += len(sents.split())
        if total_length < num_words:
            final_sents.append((score, sents, i))
        else:
            break

    top_k_sents = sorted(final_sents, key=lambda x: x[2])
    sents = " ".join([s[1] for s in top_k_sents])

    return sents
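get_summary is the extractive pre-filter used by app.py: it embeds each sentence, builds a cosine-similarity graph, ranks sentences with PageRank, and keeps the top-ranked sentences (restored to document order) until the word budget is reached. A small, hypothetical usage check against the bundled example file:

# hypothetical quick check of the TextRank pre-filter
from textrank import get_summary

document = open("example.txt", encoding="utf-8").read()
shortened = get_summary(document, num_words=300)  # keep roughly the 300 highest-ranked words
print(len(document.split()), "->", len(shortened.split()), "words")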
utils.py
ADDED
@@ -0,0 +1,121 @@
"""
utils.py - Utility functions for the project.
"""

import re
from pathlib import Path
from datetime import datetime
import subprocess

try:
    from natsort import natsorted
except ImportError:
    # natsort is not listed in requirements.txt; fall back to plain sorted()
    natsorted = sorted


def get_timestamp() -> str:
    """
    get_timestamp - get a timestamp for the current time
    Returns:
        str, the timestamp
    """
    return datetime.now().strftime("%Y%m%d_%H%M%S")


def truncate_word_count(text, max_words=512):
    """
    truncate_word_count - a helper function for the gradio module
    Parameters
    ----------
    text : str, required, the text to be processed
    max_words : int, optional, the maximum number of words, default=512
    Returns
    -------
    dict, the text and whether it was truncated
    """
    # split on whitespace with regex
    words = re.split(r"\s+", text)
    processed = {}
    if len(words) > max_words:
        processed["was_truncated"] = True
        processed["truncated_text"] = " ".join(words[:max_words])
    else:
        processed["was_truncated"] = False
        processed["truncated_text"] = text
    return processed


def load_examples(src, filetypes=[".txt", ".pdf"]):
    """
    load_examples - a helper function for the gradio module to load examples
    Returns:
        list of str, the examples
    """
    src = Path(src)
    src.mkdir(exist_ok=True)

    pdf_url = (
        "https://www.dropbox.com/s/y92xy7o5qb88yij/all_you_need_is_attention.pdf?dl=1"
    )
    subprocess.run(["wget", pdf_url, "-O", src / "all_you_need_is_attention.pdf"])
    examples = [f for f in src.iterdir() if f.suffix in filetypes]
    examples = natsorted(examples)
    # load the examples into a list
    text_examples = []
    for example in examples:
        with open(example, "r") as f:
            text = f.read()
            text_examples.append([text, "base", 2, 1024, 0.7, 3.5, 3])

    return text_examples


def load_example_filenames(example_path: str or Path):
    """
    load_example_filenames - a helper function for the gradio module to load examples
    Returns:
        dict, the examples (filename:full path)
    """
    example_path = Path(example_path)
    # load the examples into a list
    examples = {f.name: f for f in example_path.glob("*.txt")}
    return examples


def saves_summary(summarize_output, outpath: str or Path = None, add_signature=True):
    """
    saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file

    Example:

        _summaries = summarize_via_tokenbatches(
            text,
            batch_length=token_batch_length,
            batch_stride=batch_stride,
            **settings,
        )
    """

    outpath = (
        Path.cwd() / f"document_summary_{get_timestamp()}.txt"
        if outpath is None
        else Path(outpath)
    )
    sum_text = [s["summary"][0] for s in summarize_output]
    sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
    scores_text = "\n".join(sum_scores)
    full_summary = "\n\t".join(sum_text)

    with open(
        outpath,
        "w",
    ) as fo:
        if add_signature:
            fo.write(
                "Generated with the Document Summarization space :) https://hf.co/spaces/pszemraj/document-summarization\n\n"
            )
        fo.writelines(full_summary)
    with open(
        outpath,
        "a",
    ) as fo:

        fo.write("\n" * 3)
        fo.write("\n\nSection Scores:\n")
        fo.writelines(scores_text)
        fo.write("\n\n---\n")

    return outpath
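A short sketch of the two helpers app.py relies on at request time, using made-up data so it runs standalone; the dict shape passed to saves_summary matches what summarize_via_tokenbatches returns:

from utils import truncate_word_count, saves_summary

chunk = truncate_word_count("lorem " * 2000, max_words=512)
print(chunk["was_truncated"], len(chunk["truncated_text"].split()))  # True 512

# fabricated stand-in for summarize_via_tokenbatches() output, for illustration only
fake_output = [{"summary": ["A one-sentence summary."], "summary_score": -0.4213}]
print(saves_summary(fake_output))  # path like document_summary_<timestamp>.txt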