def clear_input():
    """Return a pair of empty strings.

    NOTE(review): presumably used to blank two UI text fields -- confirm
    against the caller.
    """
    return ("", "")


def camembert_generate_summary(article_text):
    """Summarize ``article_text`` with the CamemBERT tokenizer/model pair.

    Args:
        article_text: Full text of the article to summarize.

    Returns:
        The decoded summary, with special tokens removed.
    """
    inputs = cmb_tokenizer(
        [article_text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # NOTE(review): the right-hand sides of these two assignments were missing
    # in the source (leaving `input_ids` read before assignment). They are
    # rebuilt from the tokenizer output; if the model lives on another device,
    # the tensors must be moved there as well -- confirm.
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    output = cmb_model.generate(input_ids, attention_mask=attention_mask)
    return cmb_tokenizer.decode(output[0], skip_special_tokens=True)


def t5_generate_summary(article_text):
    """Summarize ``article_text`` with the T5 tokenizer/model pair.

    The text is passed through ``WHITESPACE_HANDLER`` before tokenization.
    Generation uses beam search (4 beams), forbids repeated bigrams and caps
    the output at 84 tokens.

    Args:
        article_text: Full text of the article to summarize.

    Returns:
        The decoded summary, with special tokens removed.
    """
    input_ids = t5_tokenizer(
        [WHITESPACE_HANDLER(article_text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]

    output_ids = t5_model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]

    return t5_tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )


def summarizer(dropdown_model, article_text):
    """Return a summary of the article produced by the selected model.

    Args:
        dropdown_model: Model selector, either ``'camembert'`` or ``'T5'``.
        article_text: Full text of the article to summarize.

    Returns:
        The summary string produced by the selected model.

    Raises:
        ValueError: If ``dropdown_model`` is not a supported selector.
    """
    if dropdown_model == 'camembert':
        return camembert_generate_summary(article_text)
    if dropdown_model == 'T5':
        return t5_generate_summary(article_text)
    # Previously an unknown selector fell through to an unbound `summary`.
    raise ValueError(
        f"Unsupported summarization model {dropdown_model!r}; "
        "expected 'camembert' or 'T5'."
    )


class keyWordExtractor:
    """Extract noun keywords and proper nouns from an article.

    All work happens at construction time; results are exposed as attributes:

    * ``candidates``: n-gram vocabulary found in the article.
    * ``noun_candidates``: tokens of the candidates tagged ``NOUN``.
    * ``proper_noun_candidates``: tokens of the candidates tagged ``PROPN``.
    * ``top_n_keywords``: noun candidates closest to the whole article
      according to the similarity model.
    """

    def __init__(
        self,
        article_text,
        similarity_model,
        n_gram=1,
        top_n=3,
        french_stopwords=None,
        ner=None,
    ):
        """Run the full extraction pipeline on ``article_text``.

        Args:
            article_text: Full text of the article.
            similarity_model: Object exposing ``encode(list_of_str)``, used to
                embed the article and the candidates.
            n_gram: Size of the n-grams extracted as candidates.
            top_n: Number of keywords to keep.
            french_stopwords: Stop words forwarded to ``CountVectorizer``
                (``None`` disables stop-word filtering).
            ner: Callable applied to each candidate string, yielding tokens
                with ``pos_`` and ``text`` attributes. Despite the ``None``
                default it is called for every candidate, so it has to be
                supplied in practice.
        """
        self.article_text = article_text
        self.french_stopwords = french_stopwords
        self.candidates = self.count_vectorizer(n_gram)
        self.noun_candidates, self.proper_noun_candidates = (
            self.slice_only_noun_token(ner, self.candidates)
        )
        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)

    def count_vectorizer(self, n_gram):
        """Return the candidate n-grams found in the article.

        Args:
            n_gram: Exact n-gram size (used as both lower and upper bound).

        Returns:
            The vocabulary learned by ``CountVectorizer`` from the article.
        """
        n_gram_range = (n_gram, n_gram)

        # Extract candidate words/phrases.
        count = CountVectorizer(
            ngram_range=n_gram_range,
            stop_words=self.french_stopwords,
        ).fit([self.article_text])
        # get_feature_names_out() is the current scikit-learn accessor
        # (the older get_feature_names() has been removed).
        return count.get_feature_names_out()

    def slice_only_noun_token(self, ner, token_list):
        """Split the candidates into common nouns and proper nouns.

        Each candidate is analysed on its own by ``ner``; tokens tagged
        ``NOUN`` and ``PROPN`` are collected, everything else is dropped.

        Args:
            ner (spacy): Pipeline used to read ``token.pos_``.
            token_list (list): Candidate strings from the full article.

        Returns:
            tuple: ``(noun_list, proper_noun_list)``, two lists of token texts
            in the order they were encountered.
        """
        noun_slice_list = []
        proper_noun_slice_list = []

        for candidate in token_list:
            for token in ner(candidate):
                if token.pos_ == 'NOUN':
                    noun_slice_list.append(token.text)
                elif token.pos_ == 'PROPN':
                    proper_noun_slice_list.append(token.text)

        return noun_slice_list, proper_noun_slice_list

    def top_n_extractor(self, model, top_n):
        """Return the noun candidates most similar to the article.

        Args:
            model: Object exposing ``encode(list_of_str)``.
            top_n: Maximum number of keywords to return.

        Returns:
            list: Up to ``top_n`` noun candidates, ordered by increasing
            cosine similarity to the article (most similar last). Empty when
            there are no noun candidates or ``top_n`` is not positive.
        """
        # Guard: nothing to rank, and `[-0:]` would otherwise select every
        # candidate instead of none.
        if top_n <= 0 or len(self.noun_candidates) == 0:
            return []

        doc_embedding = model.encode([self.article_text])
        candidate_embeddings = model.encode(self.noun_candidates)
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        # argsort is ascending, so the last `top_n` indices are the best ones.
        best_indices = distances.argsort()[0][-top_n:]
        return [self.noun_candidates[index] for index in best_indices]


# Loaded spaCy pipelines keyed by name, so the model is read from disk only
# once per process instead of on every request.
_SPACY_PIPELINE_CACHE = {}


def _load_spacy_pipeline(name):
    """Return the spaCy pipeline called ``name``, loading it on first use."""
    if name not in _SPACY_PIPELINE_CACHE:
        _SPACY_PIPELINE_CACHE[name] = spacy.load(name)
    return _SPACY_PIPELINE_CACHE[name]


def extract_top_3(article):
    """Return the top 3 keywords and the proper nouns of ``article``.

    Args:
        article: Full text of the article.

    Returns:
        tuple: ``(keywords, proper_nouns)``, each a single string with the
        items joined by ``", "``.
    """
    nlp = _load_spacy_pipeline("fr_core_news_md")
    # NOTE(review): `model` is expected to be a module-level sentence-embedding
    # model defined outside this section (the local load below is disabled in
    # the source) -- confirm.
    # model = SentenceTransformer("dangvantuan/sentence-camembert-large")
    extractor = keyWordExtractor(
        article,
        n_gram=1,
        top_n=3,
        ner=nlp,
        similarity_model=model,
    )
    # Turn ['a', 'b'] into "a, b".
    keyword = ", ".join(extractor.top_n_keywords)
    proper_nouns = ", ".join(extractor.proper_noun_candidates)
    return keyword, proper_nouns


def runall(dropdown_model, article_text):
    """Summarize the article and extract its keywords and proper nouns.

    Args:
        dropdown_model: Model selector forwarded to ``summarizer``.
        article_text: Full text of the article.

    Returns:
        tuple: ``(summary, keywords, proper_nouns)``.
    """
    summary = summarizer(dropdown_model, article_text)
    keywords, proper_n = extract_top_3(article_text)
    return summary, keywords, proper_n