Spaces:

Gabriel
/

Swe_summarizer

Runtime error

File size: 10,707 Bytes

69c1e4c
f449d52
1d7f828
 
f944b69
 
 
fa9a17e
fe1889a
69c1e4c
8e1af49
 
 
c35169a
f944b69
 
 
 
 
c88e4f0
 
f944b69
 
 
 
 
 
 
 
 
c35169a
f944b69
 
 
 
 
f449d52
f944b69
 
f449d52
 
f944b69
1d7f828
c35169a
1d7f828
 
 
f944b69
 
1d7f828
 
 
c35169a
f944b69
 
c35169a
 
f99e432
c35169a
 
f944b69
c35169a
 
1d7f828
f944b69
d411c20
f944b69
d411c20
f944b69
d411c20
f944b69
d411c20
 
 
 
f944b69
d411c20
 
f944b69
08b0470
d411c20
 
 
08b0470
 
d411c20
 
 
 
 
 
 
 
 
 
 
1d7f828
 
f944b69
1d7f828
f944b69
 
1d7f828
 
3466849
 
 
 
 
 
1d7f828
 
f944b69
 
 
 
1d7f828
 
f944b69
 
 
 
fe1889a
f944b69
3466849
f944b69
 
1d7f828
f944b69
 
 
 
 
795fbf0
e93f995
 
 
f944b69
 
c35169a
f944b69
 
 
 
1d7f828
f944b69
 
 
 
7b0a7b8
c35169a
1d7f828
f944b69
 
 
 
1d7f828
f944b69
c35169a
1d7f828

import gradio as gr
from transformers import pipeline
import pandas as pd
import json
import nltk
from sentence_transformers import SentenceTransformer, util
import numpy as np
from LexRank import *
from text.py import *

# NOTE(review): the `from text.py import *` line above asks Python for a
# submodule named `py` inside a package named `text`. If the example texts live
# in a sibling file called text.py, it should read `from text import *` -- confirm.

# Fetch the sentence-tokenizer data used by nltk.sent_tokenize. Older NLTK
# releases read the "punkt" package, while recent ones (3.8.2+) read
# "punkt_tab". By default nltk.download() reports an unknown package id by
# returning False instead of raising, so requesting both works on either version.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)


# Sentence-embedding models already loaded in this process, keyed by model name.
_SENTENCE_MODELS = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for *model_name*, loading it only once."""
    if model_name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODELS[model_name]


def lex_rank(in_text, threshold=None, ex_sent=4, model_in='KBLab/sentence-bert-swedish-cased', language='swedish'):
    """Build an extractive summary from the most central sentences of a text.

    The text is split into sentences, each sentence is embedded, and the
    pairwise cosine-similarity matrix is handed to ``degree_centrality_scores``
    (not defined in this file; presumably supplied by ``from LexRank import *``).

    Args:
        in_text: Text to summarize.
        threshold: Similarity threshold forwarded to ``degree_centrality_scores``.
            ``None`` or the string ``'None'`` (the UI dropdown's placeholder)
            means no threshold; any other value is converted to ``float``.
        ex_sent: Maximum number of sentences to return.
        model_in: Name of the sentence-transformers model used for embeddings.
        language: Language passed to ``nltk.sent_tokenize``.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest centrality score. An empty string when there is nothing to
        summarize.
    """
    # Nothing to rank: bail out before loading a model or tokenizing.
    if not in_text or not in_text.strip():
        return ''

    # The dropdown may deliver the threshold as a string; normalize it.
    if threshold is None or threshold == 'None':
        threshold = None
    else:
        threshold = float(threshold)

    # Slicing needs a non-negative integer even if the slider yields a float.
    sentence_count = max(int(ex_sent), 0)

    # Split the document into sentences
    sentences = nltk.sent_tokenize(in_text, language=language)
    if not sentences:
        return ''

    # Compute the sentence embeddings (the model is cached between calls)
    model = _get_sentence_model(model_in)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    cos_scores = util.cos_sim(embeddings, embeddings).cpu().numpy()

    # Compute the centrality for each sentence
    centrality_scores = degree_centrality_scores(cos_scores, threshold=threshold)

    # Negating the scores makes argsort list the highest-scoring sentences first.
    most_central_sentence_indices = np.argsort(-centrality_scores)
    return ' '.join(sentences[idx] for idx in most_central_sentence_indices[:sentence_count])


# Summarization pipelines already built in this process, keyed by model name.
_SUMMARIZATION_PIPELINES = {}


def _get_summarizer(model_name):
    """Return the summarization pipeline for *model_name*, building it only once."""
    if model_name not in _SUMMARIZATION_PIPELINES:
        _SUMMARIZATION_PIPELINES[model_name] = pipeline("summarization", model=model_name)
    return _SUMMARIZATION_PIPELINES[model_name]


def generate(in_text, num_beams, min_len, max_len, model_in):
    """Produce an abstractive summary of *in_text* with a transformers pipeline.

    Args:
        in_text: Text to summarize.
        num_beams: Beam-search width passed to the pipeline.
        min_len: Passed to the pipeline as ``min_length``.
        max_len: Passed to the pipeline as ``max_length``.
        model_in: Model name given to ``pipeline("summarization", ...)``.

    Returns:
        The ``"summary_text"`` of the first pipeline result, or an empty
        string when *in_text* is empty or only whitespace.
    """
    # Nothing to summarize: skip the (expensive) model call entirely.
    if not in_text or not in_text.strip():
        return ""

    # The input and the raw pipeline answer are echoed to stdout as a simple log.
    print(in_text)
    pipe = _get_summarizer(model_in)
    answer = pipe(
        in_text,
        # Slider values are coerced to int because generation expects integers.
        num_beams=int(num_beams),
        min_length=int(min_len),
        max_length=int(max_len),
        # Cut inputs that exceed the model's maximum length instead of failing.
        truncation=True,
    )
    print(answer)
    return answer[0]["summary_text"]

  
def update_history(df, in_text, gen_text, model_in, sum_typ, parameters):
    """Return the generation history with one new row appended.

    Args:
        df: Existing history as a DataFrame, or ``None`` to start a new one.
        in_text: Text that was summarized (column ``In_text``).
        gen_text: Generated summary (column ``Gen_text``).
        model_in: Name of the model used (column ``Gen_model``).
        sum_typ: Kind of summarization performed (column ``Sum_type``).
        parameters: JSON-serializable settings, stored as a JSON string in
            column ``Parameters``.

    Returns:
        A new DataFrame; *df* itself is not modified.
    """
    new_row = pd.DataFrame([{"In_text": in_text,
                             "Gen_text": gen_text,
                             "Sum_type": sum_typ,
                             "Gen_model": model_in,
                             "Parameters": json.dumps(parameters)}])
    if df is None:
        return new_row
    # ignore_index renumbers the rows 0..n-1; without it every appended row
    # would carry index label 0.
    return pd.concat([df, new_row], ignore_index=True)
    
def generate_transformer(in_text, num_beams, min_len, max_len, model_in, history):
    """Run abstractive summarization and record the run in the history table.

    Args:
        in_text: Text to summarize.
        num_beams: Beam-search width forwarded to ``generate``.
        min_len: Minimum summary length forwarded to ``generate``.
        max_len: Maximum summary length forwarded to ``generate``.
        model_in: Name of the summarization model.
        history: Current history DataFrame.

    Returns:
        A ``(summary_text, updated_history)`` tuple.
    """
    gen_text = generate(in_text, num_beams, min_len, max_len, model_in)
    parameters = {"num_beams": num_beams,
                  "min_len": min_len,
                  "max_len": max_len}
    # Keyword arguments keep the model name and the summary type in their own
    # columns; update_history declares model_in before sum_typ, so passing
    # them positionally in the opposite order swapped the two columns.
    updated_history = update_history(history, in_text, gen_text,
                                     model_in=model_in,
                                     sum_typ="Abstractive",
                                     parameters=parameters)
    return gen_text, updated_history

def generate_lexrank(in_text, threshold, model_in, ex_sent, language, history):
    """Run extractive (LexRank) summarization and record the run in the history table.

    Args:
        in_text: Text to summarize.
        threshold: Similarity threshold forwarded to ``lex_rank``.
        model_in: Name of the sentence-embedding model.
        ex_sent: Number of sentences to extract.
        language: Language used for sentence splitting.
        history: Current history DataFrame.

    Returns:
        A ``(summary_text, updated_history)`` tuple.
    """
    gen_text = lex_rank(in_text, threshold, ex_sent, model_in, language)
    parameters = {"threshold": threshold,
                  "Nr_sent": ex_sent,
                  "language": language}
    # Keyword arguments keep the model name and the summary type in their own
    # columns; update_history declares model_in before sum_typ, so passing
    # them positionally in the opposite order swapped the two columns.
    updated_history = update_history(history, in_text, gen_text,
                                     model_in=model_in,
                                     sum_typ="Extractive",
                                     parameters=parameters)
    return gen_text, updated_history

# Gradio UI: an info accordion, one tab per summarization method, and a shared
# history table that both "Summarize!" buttons append to.
with gr.Blocks() as demo:
    gr.Markdown("<h1><center> Swedish Summarization Engine! </center></h1>")
    with gr.Accordion("Read here for details about the app", open=False):

        with gr.Tabs():
            with gr.TabItem("The Summarization App"):
                gr.Markdown("""
                <h3>The Summarization Task</h3>
                <p>
                The goal of text summarization is to extract or generate concise and accurate summaries of a given text document while maintaining key information found                             within the original text document. Text summarization methods can either be used as an extractive or abstractive model. An Extractive method does what it sounds like, it concatenates different important sentences or paragraphs without understanding the meaning of those parts. Extractive summarization does not create any new word phrases. For instance,  if you presented a page of text to an extractive model, it would just act as a text “highlighter”. However, Abstractive summarization generates text in a fashion that tries to guess the meaning in a summarised way of the page of text it is presented. It would put words together in a meaningful way and add the most important fact found in the text. 
                </p>
                """)

            with gr.TabItem("The Summarization Engine"):
                # NOTE(review): the lines of this string that start at column 0
                # prevent a uniform dedent, so the indented tags may render as a
                # code block -- confirm how this tab looks in the running app.
                gr.Markdown("""
                <h3>Abstractive vs Extractive</h3>
                  <p>
                  Abstractive
The underlying engines for the Abstractive part are transformer based model BART, a sequence-to-sequence model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. The BART-model was pre-trained by KBLab/bart-base-swedish-cased (link) to learn general knowledge about language. Afterwards, the model was further fine-tuned on two labelled datasets that have been open-sourced:
- Gabriel/cnn_daily_swe (link)
- Gabriel/xsum_swe (link)

To see more in depth regarding the training go to link.

The core idea behind the training procedure is sequential adoption through transfer learning, i.e multiple phases for fine-tuning a pretrained model on different datasets. The figure below illustrates how the skill level of the model increases at each step:


The main benefits of transfer learning in general include the saving of resources and improved efficiency when training new models, so feel free to adopt this model for your type of problem! 🤗

Extractive:
The extractive models for this app are using sentence-transformer models, which basically is using a bi-encoder that determines how similar two sentences are. This type of models convert texts into vectors (embedding) that capture semantic information. Additionally, LexRank, an unsupervised graph-based algorithm, is used to determine centrality scores as a post-process step to summarise. The main idea is that sentences "recommend" other similar sentences to the reader. Thus, if one sentence is very similar to many others, it will likely be a sentence of great importance. The importance of this sentence also stems from the importance of the sentences "recommending" it. Thus, to get ranked highly and placed in a summary, a sentence must be similar to many sentences that are in turn also similar to many other sentences. 
                  </p>""")

    with gr.Tabs():
        # --- Abstractive tab: transformer-based summarization ---
        with gr.TabItem("Abstractive Generation for Summarization"):
            gr.Markdown(
                      """The default parameters for this transformer based model work well to generate summarization.
                         Use this tab to experiment summarization task of text for different types Abstractive models.""")
            with gr.Row():
                with gr.Column(scale=4):
                    text_baseline_transformer = gr.TextArea(label="Input text to summarize", placeholder="Input summarization")
                    # The buttons sit under the input box, in the same column,
                    # matching the layout of the extractive tab.
                    with gr.Row():
                        transformer_button_clear = gr.Button("Clear", variant='secondary')
                        transformer_button = gr.Button("Summarize!", variant='primary')

                with gr.Column(scale=3):
                    with gr.Row():
                        num_beams = gr.Slider(minimum=2, maximum=10, value=2, step=1, label="Number of Beams")
                        min_len = gr.Slider(minimum=10, maximum=50, value=25, step=5, label="Min length")
                        max_len = gr.Slider(minimum=50, maximum=130, value=120, step=10, label="Max length")
                    model_in = gr.Dropdown(["Gabriel/bart-base-cnn-swe", "Gabriel/bart-base-cnn-xsum-swe", "Gabriel/bart-base-cnn-xsum-wiki-swe"], value="Gabriel/bart-base-cnn-xsum-swe", label="Model")
                    output_basline_transformer = gr.Textbox(label="Output Text")

            with gr.Row():
                with gr.Accordion("Here are some examples you can use:", open=False):
                    gr.Markdown("<h3>Press one of the test examples below.</h3>")
                    gr.Markdown("NOTE: First time inference for a new model will take time, since a new model has to be downloaded before inference.")
                    # Each example row lists values in the same order as the
                    # input components given in the second argument.
                    gr.Examples(
                        [
                            [abstractive_example_text_1, 5, 25, 120, "Gabriel/bart-base-cnn-swe"],
                            [abstractive_example_text_2, 5, 25, 120, "Gabriel/bart-base-cnn-xsum-swe"],
                        ],
                        [text_baseline_transformer, num_beams, min_len, max_len, model_in],
                    )

        # --- Extractive tab: LexRank over sentence embeddings ---
        with gr.TabItem("Extractive Ranking Graph for Summarization"):
            gr.Markdown(
                      """Use this tab to experiment summarization task of text with a graph based method (LexRank).""")
            with gr.Row():
                with gr.Column(scale=4):
                    text_extract = gr.TextArea(label="Input text to summarize", placeholder="Input text")
                    with gr.Row():
                        extract_button_clear = gr.Button("Clear", variant='secondary')
                        extract_button = gr.Button("Summarize!", variant='primary')
                with gr.Column(scale=3):
                    with gr.Row():
                        ex_sent = gr.Slider(minimum=1, maximum=7, value=4, step=1, label="Sentences to return")
                        # 'None' is a string placeholder meaning "no threshold".
                        ex_threshold = gr.Dropdown(['None', 0.1, 0.2, 0.3, 0.4, 0.5], value='None', label="Similar Threshold")
                        ex_language = gr.Dropdown(["swedish", "english"], value="swedish", label="Language")
                    model_in_ex = gr.Dropdown(["KBLab/sentence-bert-swedish-cased", "sentence-transformers/all-MiniLM-L6-v2"], value="KBLab/sentence-bert-swedish-cased", label="Model")
                    output_extract = gr.Textbox(label="Output Text")

            with gr.Row():
                with gr.Accordion("Here are some examples you can use:", open=False):
                    gr.Markdown("<h3>Press one of the test examples below.</h3>")
                    gr.Markdown("NOTE: First time inference for a new model will take time, since a new model has to be downloaded before inference.")
                    gr.Examples(
                        [[extractive_example_text_1, 'None', 4, 'swedish', "KBLab/sentence-bert-swedish-cased"]],
                        [text_extract, ex_threshold, ex_sent, ex_language, model_in_ex],
                    )

    with gr.Box():
        gr.Markdown("<h3> Generation History </h3>")
        # Displays a dataframe with the history of generated summaries, with parameters
        history = gr.Dataframe(headers=["In_text", "Gen_text", "Sum_type", "Gen_model", "Parameters"], overflow_row_behaviour="show_ends", wrap=True)

    # Each "Summarize!" button returns the summary plus the updated history table.
    transformer_button.click(generate_transformer, inputs=[text_baseline_transformer, num_beams, min_len, max_len, model_in, history], outputs=[output_basline_transformer, history])
    extract_button.click(generate_lexrank, inputs=[text_extract, ex_threshold, model_in_ex, ex_sent, ex_language, history], outputs=[output_extract, history])

    # The "Clear" buttons had no handler attached; they now empty the input and
    # output text boxes of their own tab.
    transformer_button_clear.click(lambda: ("", ""), inputs=None, outputs=[text_baseline_transformer, output_basline_transformer])
    extract_button_clear.click(lambda: ("", ""), inputs=None, outputs=[text_extract, output_extract])


demo.launch()