# One-time setup: sumy's Tokenizer relies on NLTK's 'punkt' sentence data.
# import nltk
# nltk.download('punkt')

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

def get_Summary(in_text, nr_sentences):
    """Summarize in_text with LexRank and return one summary sentence per line."""
    if nr_sentences == 0:
        return 'Error: No sentences available'
    list_summary = get_Lexrank(in_text, nr_sentences)
    # A LexRank "sentence" can consist of multiple actual sentences separated
    # by full stops; then the corresponding timestamp cannot be found. So all
    # items from the LexRank summary are stripped of their internal full stops,
    # concatenated, and re-split on full stops (one sentence per line).
    concat_list_summary = '. '.join([str(item).replace('.', '') for item in list_summary])
    # Drop any literal backslash-n escape sequences left over in the source text.
    concat_list_summary = concat_list_summary.replace('\\n', '')
    concat_list_summary = concat_list_summary.replace('. ', '.\n') + '.'
    return concat_list_summary

def get_Lexrank(text, nr_sentences):
    """Run sumy's LexRank summarizer and return the top nr_sentences sentences."""
    LANGUAGE = "english"
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # summarizer(...) yields sumy Sentence objects, ranked by LexRank centrality.
    summary = list(summarizer(parser.document, nr_sentences))
    return summary
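
# Minimal usage sketch (the sample text below is hypothetical and not part of
# the original module): feed any plain text to get_Summary and print the
# one-sentence-per-line result. Guarded so it only runs when executed directly.
if __name__ == '__main__':
    sample = (
        "LexRank is a graph-based summarization algorithm. "
        "It ranks sentences by their centrality in a similarity graph. "
        "The highest-ranked sentences form the extractive summary. "
        "This sample text exists only to demonstrate the API."
    )
    print(get_Summary(sample, 2))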