from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy


def get_all_models():
    """Collect model names from the huggingface.co URLs pinned in requirements.txt."""
    with open("requirements.txt") as f:
        content = f.readlines()
    models = []
    for line in content:
        if "huggingface.co" in line:
            models.append(line.split("/")[4].strip())  # drop any trailing newline
    return models
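
# A sketch of the parsing above, assuming requirements.txt pins models by a
# direct URL (the actual file contents are not shown in this excerpt):
# "https://huggingface.co/org/model-name".split("/") yields
# ['https:', '', 'huggingface.co', 'org', 'model-name'], so index 4 is the
# model name.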


def clear_input():
    """Reset the UI text fields to empty strings."""
    return ("", "")


def camembert_generate_summary(article_text):
    """Summarize with the CamemBERT-based model.

    `cmb_tokenizer`, `cmb_model`, and `device` are defined at module level.
    """
    inputs = cmb_tokenizer([article_text],
                           padding="max_length",
                           truncation=True,
                           max_length=512,
                           return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = cmb_model.generate(input_ids, attention_mask=attention_mask)
    return cmb_tokenizer.decode(output[0], skip_special_tokens=True)
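
# The globals above are initialized elsewhere in this file. A minimal sketch of
# a typical setup (the checkpoint name is a placeholder, not necessarily the one
# this Space uses):
#
# import torch
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# device = "cuda" if torch.cuda.is_available() else "cpu"
# cmb_tokenizer = AutoTokenizer.from_pretrained("<camembert-summarization-checkpoint>")
# cmb_model = AutoModelForSeq2SeqLM.from_pretrained("<camembert-summarization-checkpoint>").to(device)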


def t5_generate_summary(article_text):
    """Summarize with the T5-based model.

    `t5_tokenizer`, `t5_model`, and `WHITESPACE_HANDLER` are defined at module level.
    """
    input_ids = t5_tokenizer(
        [WHITESPACE_HANDLER(article_text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]
    output_ids = t5_model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]
    return t5_tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
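
# WHITESPACE_HANDLER is defined elsewhere in this file. A common definition for
# this kind of preprocessing (an assumption, shown for reference only) collapses
# newlines and runs of whitespace into single spaces before tokenization:
#
# import re
# WHITESPACE_HANDLER = lambda k: re.sub(r"\s+", " ", re.sub(r"\n+", " ", k.strip()))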


def summarizer(dropdown_model, article_text):
    """
    Returns a summarized version of the full article based on the selected
    pretrained model.
    """
    if dropdown_model == "camembert":
        summary = camembert_generate_summary(article_text)
    elif dropdown_model == "T5":
        summary = t5_generate_summary(article_text)
    else:
        raise ValueError(f"Unknown model: {dropdown_model}")
    return summary
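
# Example call (hypothetical input text):
# summarizer("T5", "Le gouvernement a annoncé ...")  # -> a short French summary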


class keyWordExtractor:

    def __init__(self,
                 article_text,
                 similarity_model,
                 n_gram=1,
                 top_n=3,
                 french_stopwords=None,
                 ner=None,
                 ):
        self.article_text = article_text
        self.french_stopwords = french_stopwords
        self.candidates = self.count_vectorizer(n_gram)
        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(
            ner, self.candidates)
        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)

    def count_vectorizer(self, n_gram):
        """Extract candidate words/phrases of the given n-gram size."""
        n_gram_range = (n_gram, n_gram)
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=self.french_stopwords).fit([self.article_text])
        candidates = count.get_feature_names_out()
        return candidates
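
    # For illustration (hypothetical input): with n_gram=1 and French stop
    # words, fitting on ["le chat mange la souris"] makes
    # get_feature_names_out() return ['chat', 'mange', 'souris'] -- the
    # unique, stopword-filtered candidate unigrams.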

    def slice_only_noun_token(self, ner, token_list):
        """
        Given the candidate token list, keep only the noun tokens.

        Args:
            ner (spacy.Language): The pipeline used to tag `token.pos_`
            token_list (list): List of candidate tokens from the full article

        Returns:
            tuple: (nouns, proper nouns) -- tokens tagged NOUN and PROPN
        """
        noun_slice_list = []
        proper_noun_slice_list = []
        for candidate in token_list:
            doc = ner(candidate)
            for token in doc:
                if token.pos_ == "NOUN":
                    noun_slice_list.append(token.text)
                elif token.pos_ == "PROPN":
                    proper_noun_slice_list.append(token.text)
        return noun_slice_list, proper_noun_slice_list

    def top_n_extractor(self, model, top_n):
        """Rank noun candidates by cosine similarity to the full-article embedding."""
        doc_embedding = model.encode([self.article_text])
        candidate_embeddings = model.encode(self.noun_candidates)
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        keywords = [self.noun_candidates[index]
                    for index in distances.argsort()[0][-top_n:]]
        return keywords
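
    # For illustration: cosine_similarity returns shape (1, n_candidates),
    # e.g. [[0.12, 0.80, 0.45]]; argsort()[0] gives the ascending order
    # [0, 2, 1], and the [-top_n:] slice keeps the indices of the top_n most
    # similar candidates -- with top_n=2, candidates 2 and 1, in ascending
    # order of similarity.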


def extract_top_3(article):
    nlp = spacy.load("fr_core_news_md")
    # model = SentenceTransformer("dangvantuan/sentence-camembert-large")
    # `model` is the sentence-similarity model defined at module level.
    a = keyWordExtractor(article,
                         n_gram=1,
                         top_n=3,
                         ner=nlp,
                         similarity_model=model)
    keywords = ", ".join(a.top_n_keywords)  # ['a', 'b'] -> "a, b"
    proper_nouns = ", ".join(a.proper_noun_candidates)
    return keywords, proper_nouns


def runall(dropdown_model, article_text):
    summary = summarizer(dropdown_model, article_text)
    keywords, proper_n = extract_top_3(article_text)
    return summary, keywords, proper_n
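
# A minimal sketch of how these handlers are typically wired into a Gradio UI
# (component names and layout are assumptions; the actual interface code is not
# shown in this excerpt):
#
# import gradio as gr
# with gr.Blocks() as demo:
#     dropdown = gr.Dropdown(choices=["camembert", "T5"], value="T5", label="Model")
#     article = gr.Textbox(label="Article", lines=10)
#     summary = gr.Textbox(label="Summary")
#     keywords = gr.Textbox(label="Keywords")
#     proper_nouns = gr.Textbox(label="Proper nouns")
#     gr.Button("Run").click(runall, [dropdown, article],
#                            [summary, keywords, proper_nouns])
#     # clear_input returns two empty strings; wiring them to the article and
#     # summary boxes is an assumption.
#     gr.Button("Clear").click(clear_input, None, [article, summary])
# demo.launch()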