Commit 27aa731 · Parent: eacb10b
Create helper_function.py

helper_function.py  ADDED  (+131 -0)
@@ -0,0 +1,131 @@
# Helper functions for the summarization Space. The following names are
# referenced but never defined in this file; they are expected at module
# scope (presumably set up elsewhere in the Space):
#   cmb_tokenizer / cmb_model - CamemBERT-based summarization tokenizer/model
#   t5_tokenizer / t5_model   - T5-based summarization tokenizer/model
#   device                    - torch device used for CamemBERT inference
#   WHITESPACE_HANDLER        - callable that normalizes whitespace in the input
#   model                     - sentence-similarity encoder used for keywords
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def clear_input():
    return ("", "")


def camembert_generate_summary(article_text):
    inputs = cmb_tokenizer([article_text], padding="max_length", truncation=True,
                           max_length=512,
                           return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = cmb_model.generate(input_ids, attention_mask=attention_mask)
    return cmb_tokenizer.decode(output[0], skip_special_tokens=True)


def t5_generate_summary(article_text):
    input_ids = t5_tokenizer(
        [WHITESPACE_HANDLER(article_text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512)["input_ids"]

    output_ids = t5_model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4
    )[0]

    output = t5_tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output


def summarizer(dropdown_model, article_text):
    """
    Return a summary of the full article, generated by the selected pretrained model.
    """
    if dropdown_model == 'camembert':
        summary = camembert_generate_summary(article_text)
    elif dropdown_model == 'T5':
        summary = t5_generate_summary(article_text)
    else:
        # Guard against an unknown dropdown value; previously this function
        # raised UnboundLocalError when neither branch matched.
        raise ValueError(f"Unknown model: {dropdown_model!r}")

    return summary


class keyWordExtractor:

    def __init__(self,
                 article_text,
                 similarity_model,
                 n_gram=1,
                 top_n=3,
                 french_stopwords=None,
                 ner=None,
                 ):
        self.article_text = article_text
        self.french_stopwords = french_stopwords
        self.candidates = self.count_vectorizer(n_gram)
        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)

    def count_vectorizer(self, n_gram):
        """Extract candidate words/phrases as n-grams from the article."""
        n_gram_range = (n_gram, n_gram)
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=self.french_stopwords).fit([self.article_text])  # Main change
        candidates = count.get_feature_names_out()

        return candidates

    def slice_only_noun_token(self, ner, token_list):
        """
        Split the candidate tokens into common nouns and proper nouns.

        Args:
            ner (spacy.Language): spaCy pipeline used to tag `token.pos_`
            token_list (list): candidate tokens extracted from the full article

        Returns:
            (list, list): tokens tagged "NOUN" and tokens tagged "PROPN"
        """
        noun_slice_list = []
        proper_noun_slice_list = []
        for word_idx in range(len(token_list)):
            doc = ner(token_list[word_idx])

            for token in doc:
                if token.pos_ == 'NOUN':
                    noun_slice_list.append(token.text)
                elif token.pos_ == 'PROPN':
                    proper_noun_slice_list.append(token.text)

        return noun_slice_list, proper_noun_slice_list

    def top_n_extractor(self, model, top_n):
        """Rank noun candidates by cosine similarity to the whole-article embedding."""
        doc_embedding = model.encode([self.article_text])
        candidate_embeddings = model.encode(self.noun_candidates)
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]

        return keywords


def extract_top_3(article):
    nlp = spacy.load("fr_core_news_md")
    # `model` is expected at module scope,
    # e.g. model = SentenceTransformer("dangvantuan/sentence-camembert-large")

    a = keyWordExtractor(article,
                         n_gram=1,
                         top_n=3,
                         ner=nlp,
                         similarity_model=model)
    keyword = ", ".join(a.top_n_keywords)  # ['a', 'b'] -> "a, b"
    proper_nouns = ", ".join(a.proper_noun_candidates)

    return keyword, proper_nouns


def runall(dropdown_model, article_text):
    summary = summarizer(dropdown_model, article_text)
    keywords, proper_n = extract_top_3(article_text)

    return summary, keywords, proper_n
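
The helpers above lean on several names that this file never defines: cmb_tokenizer, cmb_model, t5_tokenizer, t5_model, device, WHITESPACE_HANDLER, and the sentence-similarity model. Below is a minimal sketch of the module-level setup they appear to assume. The checkpoint names are illustrative guesses, not confirmed by this commit, except dangvantuan/sentence-camembert-large, which appears in a source comment; the generation parameters in t5_generate_summary (max_length=84, no_repeat_ngram_size=2, num_beams=4, plus the whitespace handler) match the published recipe for csebuetnlp/mT5_multilingual_XLSum, which motivates that choice.

import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Collapse newlines and repeated whitespace before tokenizing.
WHITESPACE_HANDLER = lambda text: re.sub(r"\s+", " ", re.sub(r"\n+", " ", text.strip()))

# CamemBERT-based French summarizer; this checkpoint is an assumption,
# any CamemBERT seq2seq summarization model would fit the helper.
cmb_tokenizer = AutoTokenizer.from_pretrained(
    "mrm8488/camembert2camembert_shared-finetuned-french-summarization")
cmb_model = AutoModelForSeq2SeqLM.from_pretrained(
    "mrm8488/camembert2camembert_shared-finetuned-french-summarization").to(device)

# T5 summarizer; t5_generate_summary never moves tensors to `device`,
# so this model is deliberately left on CPU here.
t5_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

# Sentence-similarity encoder named in the commented-out line in extract_top_3.
model = SentenceTransformer("dangvantuan/sentence-camembert-large")

With this setup in place, runall("T5", article) returns the summary plus the two comma-joined keyword strings.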
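
clear_input() returning a pair of empty strings and the dropdown_model argument suggest these helpers back a Gradio UI in the Space. A possible wiring, purely as a sketch; the component names and layout are hypothetical, not part of this commit:

import gradio as gr
from helper_function import runall, clear_input

with gr.Blocks() as demo:
    dropdown_model = gr.Dropdown(["camembert", "T5"], value="T5", label="Model")
    article_text = gr.Textbox(lines=10, label="Article (French)")
    summary = gr.Textbox(label="Summary")
    keywords = gr.Textbox(label="Top-3 keywords")
    proper_nouns = gr.Textbox(label="Proper nouns")

    gr.Button("Summarize").click(
        runall,
        inputs=[dropdown_model, article_text],
        outputs=[summary, keywords, proper_nouns],
    )
    # clear_input() returns ("", ""), i.e. it resets two text components.
    gr.Button("Clear").click(clear_input, inputs=None, outputs=[article_text, summary])

demo.launch()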