Carlosito16 committed
Commit 27aa731
1 Parent(s): eacb10b

Create helper_function.py

Files changed (1)
  1. helper_function.py +131 -0
helper_function.py ADDED
@@ -0,0 +1,131 @@
+import spacy
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+# NOTE: cmb_tokenizer, cmb_model, t5_tokenizer, t5_model, device,
+# WHITESPACE_HANDLER and the sentence-similarity `model` are assumed to be
+# defined by the importing app before these helpers are called.
+
+
+def clear_input():
+    # Reset both text fields of the demo UI.
+    return ("", "")
+
+
+def camembert_generate_summary(article_text):
+    # Tokenize the article (truncated to 512 tokens), generate with the
+    # CamemBERT-based seq2seq model, and decode the ids back to text.
+    inputs = cmb_tokenizer([article_text], padding="max_length", truncation=True,
+                           max_length=512,
+                           return_tensors="pt")
+    input_ids = inputs.input_ids.to(device)
+    attention_mask = inputs.attention_mask.to(device)
+    output = cmb_model.generate(input_ids, attention_mask=attention_mask)
+    return cmb_tokenizer.decode(output[0], skip_special_tokens=True)
+
+
+def t5_generate_summary(article_text):
+    # Normalize whitespace, tokenize (truncated to 512 tokens), then generate
+    # a summary of at most 84 tokens with 4-beam search.
+    input_ids = t5_tokenizer(
+        [WHITESPACE_HANDLER(article_text)],
+        return_tensors="pt",
+        padding="max_length",
+        truncation=True,
+        max_length=512)["input_ids"]
+
+    output_ids = t5_model.generate(
+        input_ids=input_ids,
+        max_length=84,
+        no_repeat_ngram_size=2,
+        num_beams=4
+    )[0]
+
+    output = t5_tokenizer.decode(
+        output_ids,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
+    )
+
+    return output
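+
+# WHITESPACE_HANDLER is not defined in this file. A minimal sketch of the
+# kind of normalizer the call above assumes (collapse newlines and runs of
+# whitespace into single spaces); hypothetical, not part of this commit:
+#
+#     import re
+#     WHITESPACE_HANDLER = lambda t: re.sub(r"\s+", " ", re.sub(r"\n+", " ", t.strip()))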
+
+
+def summarizer(dropdown_model, article_text):
+    """
+    Returns a summarized version of the full article based on the selected
+    pretrained model.
+    """
+    if dropdown_model == 'camembert':
+        summary = camembert_generate_summary(article_text)
+    elif dropdown_model == 'T5':
+        summary = t5_generate_summary(article_text)
+    else:
+        raise ValueError(f"Unknown model choice: {dropdown_model}")
+    return summary
+
+
+class keyWordExtractor:
+
+    def __init__(self,
+                 article_text,
+                 similarity_model,
+                 n_gram=1,
+                 top_n=3,
+                 french_stopwords=None,
+                 ner=None,
+                 ):
+        self.article_text = article_text
+        self.french_stopwords = french_stopwords
+        self.candidates = self.count_vectorizer(n_gram)
+        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
+        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)
+
+    def count_vectorizer(self, n_gram):
+        n_gram_range = (n_gram, n_gram)
+        # Extract candidate words/phrases: the vocabulary of unique n-grams,
+        # with French stop words removed.
+        count = CountVectorizer(ngram_range=n_gram_range,
+                                stop_words=self.french_stopwords).fit([self.article_text])
+        candidates = count.get_feature_names_out()
+
+        return candidates
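+
+    # A tiny worked example of count_vectorizer, assuming scikit-learn's
+    # default tokenization:
+    #     CountVectorizer(ngram_range=(1, 1)).fit(["le chat dort le jour"])
+    # yields get_feature_names_out() -> ['chat', 'dort', 'jour', 'le'],
+    # i.e. the sorted unique unigrams become the keyword candidates.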
+
+    def slice_only_noun_token(self, ner, token_list):
+        """
+        Given the list of candidate tokens, keep only noun-like tokens.
+
+        Args:
+            ner (spacy.Language): spaCy pipeline used to tag `token.pos_`
+            token_list (list): list of candidate tokens from the full article
+
+        Returns:
+            noun_slice_list (list): tokens tagged as "NOUN"
+            proper_noun_slice_list (list): tokens tagged as "PROPN"
+        """
+        noun_slice_list = []
+        proper_noun_slice_list = []
+        for candidate in token_list:
+            doc = ner(candidate)
+            for token in doc:
+                if token.pos_ == 'NOUN':
+                    noun_slice_list.append(token.text)
+                elif token.pos_ == 'PROPN':
+                    proper_noun_slice_list.append(token.text)
+
+        return noun_slice_list, proper_noun_slice_list
+
+    def top_n_extractor(self, model, top_n):
+        # Embed the document and each noun candidate, rank candidates by
+        # cosine similarity to the document, and keep the top_n most similar.
+        doc_embedding = model.encode([self.article_text])
+        candidate_embeddings = model.encode(self.noun_candidates)
+        distances = cosine_similarity(doc_embedding, candidate_embeddings)
+        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]
+
+        return keywords
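+
+    # A minimal sketch of the ranking step above, with made-up 2-d embeddings
+    # (the real vectors come from the SentenceTransformer similarity_model):
+    #
+    #     import numpy as np
+    #     from sklearn.metrics.pairwise import cosine_similarity
+    #     doc = np.array([[1.0, 0.0]])
+    #     cands = np.array([[1.0, 0.0], [0.0, 1.0], [0.7, 0.7]])
+    #     cosine_similarity(doc, cands).argsort()[0]   # -> [1, 2, 0]
+    #     # argsort orders least-to-most similar, so [-2:] keeps the top 2.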
+
+
+def extract_top_3(article):
+    nlp = spacy.load("fr_core_news_md")
+    # `model` is assumed to be a SentenceTransformer defined at the app level,
+    # e.g. model = SentenceTransformer("dangvantuan/sentence-camembert-large")
+
+    a = keyWordExtractor(article,
+                         n_gram=1,
+                         top_n=3,
+                         ner=nlp,
+                         similarity_model=model)
+    keywords = ", ".join(a.top_n_keywords)  # ['a', 'b'] -> "a, b"
+    proper_nouns = ", ".join(a.proper_noun_candidates)
+
+    return keywords, proper_nouns
+
+
+def runall(dropdown_model, article_text):
+    # Full pipeline for the UI: summary, top keywords, and proper nouns.
+    summary = summarizer(dropdown_model, article_text)
+    keywords, proper_n = extract_top_3(article_text)
+
+    return summary, keywords, proper_n
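+
+
+# Example usage (hypothetical input text; the models, tokenizers and `model`
+# above must be loaded by the app first):
+#
+#     summary, keywords, proper_nouns = runall("T5", article_text)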