File size: 2,455 Bytes
cdc5783
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import logging
import re

from langdetect import detect
from transformers import pipeline

from utils.tag_utils import filter_tags

# Version marker returned under the "ver" key of every summarize() result.
# NOTE(review): presumably bumped when the output format or models change — confirm.
AiSummaryVersion = 1
# The transformers pipelines below are constructed once, when this module is
# imported, and are shared by the helper functions in this file.
# NOTE(review): construction presumably loads the model weights, so importing
# this module may be slow and memory-hungry — confirm.
# "summarization" pipeline called by get_summarization().
summarization_pipeline = pipeline("summarization", model="csebuetnlp/mT5_multilingual_XLSum")
# "translation" pipeline called by get_en_translation() for non-English text.
en_translation_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
# "text-classification" pipeline called by get_classification().
classification_pipe = pipeline("text-classification", model="Yueh-Huan/news-category-classification-distilbert")
# "text2text-generation" pipeline called by get_tags().
tag_gen_pipe = pipeline("text2text-generation", model="fabiochiu/t5-base-tag-generation")


def summarize(text: str) -> dict:
    """Summarize *text* and derive a list of tags for it.

    Texts longer than 100 characters are condensed with
    ``get_summarization``; shorter texts are used verbatim as the summary.
    The summary is translated to English with ``get_en_translation`` and the
    English text is passed to both ``get_classification`` and ``get_tags``.
    The combined, de-duplicated tags are finally run through ``filter_tags``.

    Args:
        text: Source text. ``None`` or fewer than 10 characters is treated as
            too short to process.

    Returns:
        ``{"ver": AiSummaryVersion}`` when the input is missing or too short;
        otherwise a dict with the keys ``"ver"``, ``"summary"`` (``None`` if
        summarization failed) and ``"tags"``.
    """
    if text is None or len(text) < 10:
        return {
            "ver": AiSummaryVersion
        }

    summary = get_summarization(text) if len(text) > 100 else text
    translated = get_en_translation(summary)
    candidates = get_classification(translated) + get_tags(translated)
    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a
    # set of strings instead yields an order that changes between processes,
    # which made the returned tag order non-reproducible.
    tags = filter_tags(list(dict.fromkeys(candidates)))

    return {
        "ver": AiSummaryVersion,
        "summary": summary,
        "tags": tags,
    }


def get_summarization(text: str):
    """Return a summary of *text*, or ``None`` when none can be produced.

    Args:
        text: Text to summarize. ``None`` is accepted and yields ``None``.

    Returns:
        The ``'summary_text'`` field of the pipeline output (its first element
        when the pipeline returns a list), or ``None`` if *text* is ``None``
        or the pipeline call fails.
    """
    if text is None:
        return None
    try:
        result = summarization_pipeline(text)
        first = result[0] if isinstance(result, list) else result
        return first['summary_text']
    except Exception:
        # Best effort: report the failure instead of hiding it, but do not
        # intercept KeyboardInterrupt/SystemExit as a bare ``except`` would.
        logging.getLogger(__name__).exception("Summarization failed")
        return None


def get_en_translation(text: str):
    """Return *text* in English, translating it only when necessary.

    Args:
        text: Text to translate. ``None`` is accepted and yields ``None``.

    Returns:
        *text* unchanged when ``is_english`` reports it as English; otherwise
        the ``'translation_text'`` field of the translation pipeline output
        (its first element when the pipeline returns a list). ``None`` if
        *text* is ``None`` or the translation fails.
    """
    if text is None:
        return None
    try:
        if is_english(text):
            return text
        result = en_translation_pipe(text)
        first = result[0] if isinstance(result, list) else result
        return first['translation_text']
    except Exception:
        # Best effort: report the failure instead of hiding it, but do not
        # intercept KeyboardInterrupt/SystemExit as a bare ``except`` would.
        logging.getLogger(__name__).exception("Translation to English failed")
        return None


def is_english(text) -> bool:
    """Return ``True`` when language detection classifies *text* as English.

    Args:
        text: Text to inspect. Non-string or blank values are reported as
            not English without calling the detector.

    Returns:
        ``True`` if ``detect`` returns ``'en'``; ``False`` otherwise,
        including when detection raises.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Detection failure is treated as "not English" (presumably raised for
        # text the detector cannot classify). ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return False


def get_tags(text: str) -> list:
    """Generate single-word tags for *text*.

    The generated string is split on ``&`` and ``,``; each piece is stripped
    of surrounding whitespace and kept only if it is longer than two
    characters and contains no space.

    Args:
        text: Text to tag. ``None`` is accepted and yields an empty list.

    Returns:
        The list of accepted tags, in generation order. Empty if *text* is
        ``None`` or tag generation fails.
    """
    if text is None:
        return []
    try:
        result = tag_gen_pipe(text)
        first = result[0] if isinstance(result, list) else result
        pieces = (piece.strip() for piece in re.split(r'[&,]', first['generated_text']))
        return [tag for tag in pieces if len(tag) > 2 and ' ' not in tag]
    except Exception:
        # Best effort: report the failure instead of hiding it, but do not
        # intercept KeyboardInterrupt/SystemExit as a bare ``except`` would.
        logging.getLogger(__name__).exception("Tag generation failed")
        return []


def get_classification(text: str, min_score: float = 0.75) -> list:
    """Return the category labels predicted for *text* with enough confidence.

    Args:
        text: Text to classify. ``None`` is accepted and yields an empty list.
        min_score: A prediction is kept only if its ``'score'`` is strictly
            greater than this value.

    Returns:
        The stripped ``'label'`` of every prediction that passes the score
        threshold. Empty if *text* is ``None``, nothing passes, or the
        classification fails.
    """
    if text is None:
        return []
    try:
        result = classification_pipe(text)
        # Handle a single prediction dict and a list of them the same way.
        predictions = result if isinstance(result, list) else [result]
        return [
            prediction['label'].strip()
            for prediction in predictions
            if prediction['score'] > min_score
        ]
    except Exception:
        # Best effort: report the failure instead of hiding it, but do not
        # intercept KeyboardInterrupt/SystemExit as a bare ``except`` would.
        logging.getLogger(__name__).exception("Classification failed")
        return []