Spaces:
Sleeping
Sleeping
import gradio as gr | |
import os | |
import lexrank as lr | |
import nltk | |
import metrics | |
def _ensure_punkt():
    """Make sure the NLTK 'punkt' tokenizer data is available.

    Looks the resource up through NLTK's own search path instead of a
    hard-coded file location, and downloads it only when it is missing.
    """
    try:
        found = nltk.data.find('tokenizers/punkt')
    except LookupError:
        print("downloading punkt file")
        nltk.download('punkt')
    else:
        print('nltk punkt file exists in ', found)


def summarize(in_text):
    """Build an extractive summary of *in_text* that stays under a token budget.

    Sentences (split on '.') with 10 words or fewer are discarded first.
    The summary is then grown one sentence at a time via ``lr.get_Summary``
    and the largest summary whose token count is still below the budget is
    kept. The one-sentence summary is always used as the baseline, even if
    it is itself over the budget, so that some text is returned.

    Args:
        in_text: The raw text to summarize.

    Returns:
        A 5-tuple ``(summary, n_words, n_sentences, n_chars, n_tokens)``.
        On invalid input the first element is an error message and the
        four statistics are ``None``, so the tuple always matches the five
        output components of the interface.
    """
    if not in_text:
        return 'Error: No text provided', None, None, None, None

    # Discard all sentences that have 10 words or fewer in them.
    in_text_sentences = in_text.split('.')
    print(in_text_sentences)
    in_longtext = [sen for sen in in_text_sentences if len(sen.split()) > 10]
    if not in_longtext:
        # Nothing left to rank; avoid handing a bare '.' to the summarizer.
        return ('Error: No sentences with more than 10 words found',
                None, None, None, None)
    in_text = '.'.join(in_longtext) + '.'

    # Only needed once there is real text to tokenize.
    _ensure_punkt()

    # The size of the summary is limited to 1024 tokens, but the summarizer
    # only accepts a sentence count as its limit. So we start with one
    # sentence and keep adding sentences while the result stays under the
    # token budget.
    target_tokens = 1024
    in_sents = metrics.num_sentences(in_text)
    out_text = lr.get_Summary(in_text, 1)
    n_tokens = metrics.num_tokens(out_text)

    if n_tokens < target_tokens:
        # NOTE: as in the original, the full sentence count (in_sents) itself
        # is never requested -- the range stops one short of it.
        for n_requested in range(2, in_sents):
            candidate = lr.get_Summary(in_text, n_requested)
            candidate_tokens = metrics.num_tokens(candidate)
            if candidate_tokens >= target_tokens:
                # Keep the previous summary *and* its token count together;
                # assumes larger sentence counts never yield fewer tokens.
                break
            out_text, n_tokens = candidate, candidate_tokens

    n_sents = metrics.num_sentences(out_text)
    n_words = metrics.num_words(out_text)
    n_chars = metrics.num_chars(out_text)
    return out_text, n_words, n_sents, n_chars, n_tokens
# Web UI: one free-text input, and five outputs -- the summary text followed
# by four numeric statistics, in the order summarize() returns them.
demo = gr.Interface(
    fn=summarize,
    inputs=["text"],
    outputs=[
        gr.Textbox(label="Extractive Summary"),
        *(
            gr.Number(label=caption)
            for caption in (
                "Number of Words",
                "Number of Sentences",
                "Number of Characters",
                "Number of Tokens",
            )
        ),
    ],
    allow_flagging="never",
    queue=True,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()