Commit 27aa731 · Parent: eacb10b
Create helper_function.py

helper_function.py  ADDED  (+131 -0)
@@ -0,0 +1,131 @@
# Helper functions for the summarization Space. The following names are
# referenced but never defined in this file; they are expected at module
# scope (presumably set up elsewhere in the Space):
#   cmb_tokenizer / cmb_model - CamemBERT-based summarization tokenizer/model
#   t5_tokenizer / t5_model   - T5-based summarization tokenizer/model
#   device                    - torch device used for CamemBERT inference
#   WHITESPACE_HANDLER        - callable that normalizes whitespace in the input
#   model                     - sentence-similarity encoder used for keywords
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def clear_input():
    return ("", "")


def camembert_generate_summary(article_text):
    inputs = cmb_tokenizer([article_text], padding="max_length", truncation=True,
                           max_length=512,
                           return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = cmb_model.generate(input_ids, attention_mask=attention_mask)
    return cmb_tokenizer.decode(output[0], skip_special_tokens=True)


def t5_generate_summary(article_text):
    input_ids = t5_tokenizer(
        [WHITESPACE_HANDLER(article_text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512)["input_ids"]

    output_ids = t5_model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4
    )[0]

    output = t5_tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output


def summarizer(dropdown_model, article_text):
    """
    Return a summary of the full article, generated by the selected pretrained model.
    """
    if dropdown_model == 'camembert':
        summary = camembert_generate_summary(article_text)
    elif dropdown_model == 'T5':
        summary = t5_generate_summary(article_text)
    else:
        # Guard against an unknown dropdown value; previously this function
        # raised UnboundLocalError when neither branch matched.
        raise ValueError(f"Unknown model: {dropdown_model!r}")

    return summary


class keyWordExtractor:

    def __init__(self,
                 article_text,
                 similarity_model,
                 n_gram=1,
                 top_n=3,
                 french_stopwords=None,
                 ner=None,
                 ):
        self.article_text = article_text
        self.french_stopwords = french_stopwords
        self.candidates = self.count_vectorizer(n_gram)
        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)

    def count_vectorizer(self, n_gram):
        """Extract candidate words/phrases as n-grams from the article."""
        n_gram_range = (n_gram, n_gram)
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=self.french_stopwords).fit([self.article_text])  # Main change
        candidates = count.get_feature_names_out()

        return candidates

    def slice_only_noun_token(self, ner, token_list):
        """
        Split the candidate tokens into common nouns and proper nouns.

        Args:
            ner (spacy.Language): spaCy pipeline used to tag `token.pos_`
            token_list (list): candidate tokens extracted from the full article

        Returns:
            (list, list): tokens tagged "NOUN" and tokens tagged "PROPN"
        """
        noun_slice_list = []
        proper_noun_slice_list = []
        for word_idx in range(len(token_list)):
            doc = ner(token_list[word_idx])

            for token in doc:
                if token.pos_ == 'NOUN':
                    noun_slice_list.append(token.text)
                elif token.pos_ == 'PROPN':
                    proper_noun_slice_list.append(token.text)

        return noun_slice_list, proper_noun_slice_list

    def top_n_extractor(self, model, top_n):
        """Rank noun candidates by cosine similarity to the whole-article embedding."""
        doc_embedding = model.encode([self.article_text])
        candidate_embeddings = model.encode(self.noun_candidates)
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]

        return keywords


def extract_top_3(article):
    nlp = spacy.load("fr_core_news_md")
    # `model` is expected at module scope,
    # e.g. model = SentenceTransformer("dangvantuan/sentence-camembert-large")

    a = keyWordExtractor(article,
                         n_gram=1,
                         top_n=3,
                         ner=nlp,
                         similarity_model=model)
    keyword = ", ".join(a.top_n_keywords)  # ['a', 'b'] -> "a, b"
    proper_nouns = ", ".join(a.proper_noun_candidates)

    return keyword, proper_nouns


def runall(dropdown_model, article_text):
    summary = summarizer(dropdown_model, article_text)
    keywords, proper_n = extract_top_3(article_text)

    return summary, keywords, proper_n
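
The helpers above lean on several names that this file never defines: cmb_tokenizer, cmb_model, t5_tokenizer, t5_model, device, WHITESPACE_HANDLER, and the sentence-similarity model. Below is a minimal sketch of the module-level setup they appear to assume. The checkpoint names are illustrative guesses, not confirmed by this commit, except dangvantuan/sentence-camembert-large, which appears in a source comment; the generation parameters in t5_generate_summary (max_length=84, no_repeat_ngram_size=2, num_beams=4, plus the whitespace handler) match the published recipe for csebuetnlp/mT5_multilingual_XLSum, which motivates that choice.

import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Collapse newlines and repeated whitespace before tokenizing.
WHITESPACE_HANDLER = lambda text: re.sub(r"\s+", " ", re.sub(r"\n+", " ", text.strip()))

# CamemBERT-based French summarizer; this checkpoint is an assumption,
# any CamemBERT seq2seq summarization model would fit the helper.
cmb_tokenizer = AutoTokenizer.from_pretrained(
    "mrm8488/camembert2camembert_shared-finetuned-french-summarization")
cmb_model = AutoModelForSeq2SeqLM.from_pretrained(
    "mrm8488/camembert2camembert_shared-finetuned-french-summarization").to(device)

# T5 summarizer; t5_generate_summary never moves tensors to `device`,
# so this model is deliberately left on CPU here.
t5_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

# Sentence-similarity encoder named in the commented-out line in extract_top_3.
model = SentenceTransformer("dangvantuan/sentence-camembert-large")

With this setup in place, runall("T5", article) returns the summary plus the two comma-joined keyword strings.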
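
clear_input() returning a pair of empty strings and the dropdown_model argument suggest these helpers back a Gradio UI in the Space. A possible wiring, purely as a sketch; the component names and layout are hypothetical, not part of this commit:

import gradio as gr
from helper_function import runall, clear_input

with gr.Blocks() as demo:
    dropdown_model = gr.Dropdown(["camembert", "T5"], value="T5", label="Model")
    article_text = gr.Textbox(lines=10, label="Article (French)")
    summary = gr.Textbox(label="Summary")
    keywords = gr.Textbox(label="Top-3 keywords")
    proper_nouns = gr.Textbox(label="Proper nouns")

    gr.Button("Summarize").click(
        runall,
        inputs=[dropdown_model, article_text],
        outputs=[summary, keywords, proper_nouns],
    )
    # clear_input() returns ("", ""), i.e. it resets two text components.
    gr.Button("Clear").click(clear_input, inputs=None, outputs=[article_text, summary])

demo.launch()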