Spaces:

wldmr
/

lexrank-gr

Sleeping

wldmr committed on Mar 23, 2023

Commit

252c8c0

1 Parent(s): ee33df4

discard short sentences

Files changed (2) hide show

app.py CHANGED Viewed

@@ -17,6 +17,24 @@ def summarize(in_text):
         print("downloading punkt file")
         nltk.download('punkt')
     target_tokens = 1024
     in_sents = metrics.num_sentences(in_text)
@@ -25,7 +43,6 @@ def summarize(in_text):
     n_tokens= metrics.num_tokens(out_text)
     prev_n_tokens=0
     for sen in range(2, in_sents):
-        #print(sen,in_sents,n_tokens)
         if n_tokens >= target_tokens:
             n_tokens = prev_n_tokens
             break

         print("downloading punkt file")
         nltk.download('punkt')
+    in_longtext = []
+    # Discard all sentences that have 10 or fewer words in them
+    in_text_sentenses = in_text.split('.')
+    print(in_text_sentenses)
+    for sen in in_text_sentenses:
+        print(sen)
+        print(len(sen.split()))
+        if len(sen.split()) > 10:
+            in_longtext.append(sen)
+    in_text = '.'.join(in_longtext)+'.'
+    print('strip')
+    print(in_text)
+    # The size of the summary is limited to 1024 tokens.
+    # The LexRank algorithm accepts only a number of sentences as a limit.
+    # We start with one sentence and check the token size,
+    # then increase the number of sentences until the token size
+    # of the next sentence exceeds the limit.
     target_tokens = 1024
     in_sents = metrics.num_sentences(in_text)
     n_tokens= metrics.num_tokens(out_text)
     prev_n_tokens=0
     for sen in range(2, in_sents):
         if n_tokens >= target_tokens:
             n_tokens = prev_n_tokens
             break

lexrank.py CHANGED Viewed

@@ -23,7 +23,7 @@ def get_Summary(in_text, nr_sentences):
     # all items from the lexrank summary must be concatenated and split up by full stops.
     concat_list_summary = '. '.join([str(item).replace('.','') for item in list_summary])#.split('. ')
     concat_list_summary = concat_list_summary.replace('\\n','')
-    concat_list_summary = concat_list_summary.replace('. ','.\n')
     return concat_list_summary

     # all items from the lexrank summary must be concatenated and split up by full stops.
     concat_list_summary = '. '.join([str(item).replace('.','') for item in list_summary])#.split('. ')
     concat_list_summary = concat_list_summary.replace('\\n','')
+    concat_list_summary = concat_list_summary.replace('. ','.\n')+'.'
     return concat_list_summary