wldmr committed on
Commit
252c8c0
·
1 Parent(s): ee33df4

discard short sentences

Browse files
Files changed (2) hide show
  1. app.py +18 -1
  2. lexrank.py +1 -1
app.py CHANGED
@@ -17,6 +17,24 @@ def summarize(in_text):
17
  print("downloading punkt file")
18
  nltk.download('punkt')
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  target_tokens = 1024
21
 
22
  in_sents = metrics.num_sentences(in_text)
@@ -25,7 +43,6 @@ def summarize(in_text):
25
  n_tokens= metrics.num_tokens(out_text)
26
  prev_n_tokens=0
27
  for sen in range(2, in_sents):
28
- #print(sen,in_sents,n_tokens)
29
  if n_tokens >= target_tokens:
30
  n_tokens = prev_n_tokens
31
  break
 
17
  print("downloading punkt file")
18
  nltk.download('punkt')
19
 
20
+ in_longtext = []
21
+ # Discard all sentences that have 10 words or fewer
22
+ in_text_sentenses = in_text.split('.')
23
+ print(in_text_sentenses)
24
+ for sen in in_text_sentenses:
25
+ print(sen)
26
+ print(len(sen.split()))
27
+ if len(sen.split()) > 10:
28
+ in_longtext.append(sen)
29
+ in_text = '.'.join(in_longtext)+'.'
30
+ print('strip')
31
+ print(in_text)
32
+
33
+ # The size of the summary is limited to 1024 tokens
34
+ # The LexRank algorithm accepts only a number of sentences as a limit
35
+ # We start with one sentence and check the token size
36
+ # Then increase the number of sentences until the tokensize
37
+ # of the next sentence exceeds the limit
38
  target_tokens = 1024
39
 
40
  in_sents = metrics.num_sentences(in_text)
 
43
  n_tokens= metrics.num_tokens(out_text)
44
  prev_n_tokens=0
45
  for sen in range(2, in_sents):
 
46
  if n_tokens >= target_tokens:
47
  n_tokens = prev_n_tokens
48
  break
lexrank.py CHANGED
@@ -23,7 +23,7 @@ def get_Summary(in_text, nr_sentences):
23
  # all items from the lexrank summary must be concatenated and split up by full stops.
24
  concat_list_summary = '. '.join([str(item).replace('.','') for item in list_summary])#.split('. ')
25
  concat_list_summary = concat_list_summary.replace('\\n','')
26
- concat_list_summary = concat_list_summary.replace('. ','.\n')
27
 
28
  return concat_list_summary
29
 
 
23
  # all items from the lexrank summary must be concatenated and split up by full stops.
24
  concat_list_summary = '. '.join([str(item).replace('.','') for item in list_summary])#.split('. ')
25
  concat_list_summary = concat_list_summary.replace('\\n','')
26
+ concat_list_summary = concat_list_summary.replace('. ','.\n')+'.'
27
 
28
  return concat_list_summary
29