# sumy's Tokenizer relies on NLTK's 'punkt' data; uncomment for a first-time setup:
#import nltk
#nltk.download('punkt')
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
def get_Summary(in_text, nr_sentences):
    if nr_sentences == 0:
        return 'Error: No sentences available'
    list_summary = get_Lexrank(in_text, nr_sentences)
    # A LexRank summary item can consist of multiple actual sentences separated
    # by full stops; the corresponding timestamp then cannot be found. To avoid
    # this, every item is stripped of its internal full stops before all items
    # are concatenated and re-split at the remaining full stops.
    concat_list_summary = '. '.join([str(item).replace('.', '') for item in list_summary])
    concat_list_summary = concat_list_summary.replace('\\n', '')  # drop literal '\n' leftovers
    concat_list_summary = concat_list_summary.replace('. ', '.\n') + '.'  # one sentence per line
    return concat_list_summary
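
# Illustration of the concatenation step above (assumed input, not from the
# original file): if LexRank returned the items
# ['He said hi. Then he left', 'It was late'],
# the internal stop in the first item is removed, the items are joined, and
# the result is 'He said hi Then he left.\nIt was late.', one sentence per line.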
def get_Lexrank(text, nr_sentences):
    # Run sumy's LexRank summarizer and collect the top-ranked sentences.
    summary = []
    LANGUAGE = "english"
    SENTENCES_COUNT = nr_sentences
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary.append(sentence)
    return summary
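
# A minimal usage sketch (not part of the original file): the sample text and
# sentence count below are illustrative assumptions showing how get_Summary
# would be called on transcript-like text.
if __name__ == '__main__':
    sample_text = (
        'LexRank is a graph-based summarization algorithm. '
        'It scores sentences by their centrality in a sentence-similarity graph. '
        'The highest-ranked sentences are returned as the summary. '
        'This sample text exists only to demonstrate the call.'
    )
    print(get_Summary(sample_text, 2))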