discard short sentences
- app.py +18 -1
- lexrank.py +1 -1
app.py
CHANGED
@@ -17,6 +17,24 @@ def summarize(in_text):
     print("downloading punkt file")
     nltk.download('punkt')
 
+    in_longtext = []
+    # Discard all sentences with 10 words or fewer
+    in_text_sentenses = in_text.split('.')
+    print(in_text_sentenses)
+    for sen in in_text_sentenses:
+        print(sen)
+        print(len(sen.split()))
+        if len(sen.split()) > 10:
+            in_longtext.append(sen)
+    in_text = '.'.join(in_longtext)+'.'
+    print('strip')
+    print(in_text)
+
+    # The size of the summary is limited to 1024 tokens
+    # The LexRank algorithm accepts only a number of sentences as a limit
+    # We start with one sentence and check the token size
+    # Then increase the number of sentences until the token size
+    # of the next sentence exceeds the limit
     target_tokens = 1024
 
     in_sents = metrics.num_sentences(in_text)
@@ -25,7 +43,6 @@ def summarize(in_text):
     n_tokens = metrics.num_tokens(out_text)
     prev_n_tokens = 0
     for sen in range(2, in_sents):
-        #print(sen,in_sents,n_tokens)
         if n_tokens >= target_tokens:
            n_tokens = prev_n_tokens
            break
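The hunk above inlines a pre-filter in summarize(): the input text is split on full stops and any sentence with 10 words or fewer is dropped before LexRank sees it, so headings and stray fragments do not end up in the summary. Below is a minimal sketch of that behaviour, using a hypothetical keep_long_sentences helper (not part of app.py) and plain whitespace word counts:

def keep_long_sentences(text: str, min_words: int = 10) -> str:
    """Drop every sentence with `min_words` words or fewer.

    Mirrors the filter added to summarize() in app.py: split on full
    stops, count whitespace-separated words, keep only the long
    sentences, and re-join them with full stops.
    """
    long_sentences = [
        sentence for sentence in text.split('.')
        if len(sentence.split()) > min_words
    ]
    return '.'.join(long_sentences) + '.'


sample = (
    "Introduction. "
    "The quick brown fox jumps over the lazy dog while the farmer watches from the porch. "
    "See figure 2."
)
# Only the long middle sentence survives; "Introduction" and
# "See figure 2" are discarded as short fragments.
print(keep_long_sentences(sample))

The new comment block in the diff describes the second step already present in the file: target_tokens is fixed at 1024, and the loop in the second hunk grows the number of LexRank sentences from 2 up to in_sents until metrics.num_tokens of the candidate summary reaches that budget.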
lexrank.py
CHANGED
@@ -23,7 +23,7 @@ def get_Summary(in_text, nr_sentences):
     # all items from the lexrank summary must be concatenated and split up by full stops.
     concat_list_summary = '. '.join([str(item).replace('.','') for item in list_summary])#.split('. ')
     concat_list_summary = concat_list_summary.replace('\\n','')
-    concat_list_summary = concat_list_summary.replace('. ','.\n')
+    concat_list_summary = concat_list_summary.replace('. ','.\n')+'.'
 
     return concat_list_summary
 
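The only change in lexrank.py is the trailing +'.', so that the last sentence of the formatted summary ends with a full stop like the ones before it. A small demo of the string pipeline in get_Summary, with a made-up list_summary standing in for the sentence objects the LexRank library returns:

# Stand-in data: in lexrank.py, list_summary holds LexRank sentence
# objects; plain strings are enough to show the formatting.
list_summary = [
    "The app downloads the punkt tokenizer on first use",
    "Sentences with ten words or fewer are discarded before summarization",
]

# Same pipeline as get_Summary: strip periods from each item, re-join
# with '. ', drop any literal '\n' markers, put each sentence on its
# own line, then terminate the summary with a final full stop.
concat_list_summary = '. '.join(str(item).replace('.', '') for item in list_summary)
concat_list_summary = concat_list_summary.replace('\\n', '')
concat_list_summary = concat_list_summary.replace('. ', '.\n') + '.'

print(concat_list_summary)
# The app downloads the punkt tokenizer on first use.
# Sentences with ten words or fewer are discarded before summarization.
# Without the trailing +'.', the last line would end without a period.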