Spaces:
Sleeping
Sleeping
Nikhil Singh
committed on
Commit
·
1afbc3a
1
Parent(s):
9fe2871
t5 add
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ from mailparser import parse_from_file
|
|
4 |
from bs4 import BeautifulSoup
|
5 |
from gliner import GLiNER
|
6 |
from typing import Dict, Union, List
|
|
|
7 |
|
8 |
import spacy
|
9 |
import re
|
@@ -11,6 +12,9 @@ import os
|
|
11 |
import en_core_web_sm
|
12 |
nlp = en_core_web_sm.load()
|
13 |
|
|
|
|
|
|
|
14 |
_MODEL = {}
|
15 |
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
|
16 |
|
@@ -58,6 +62,13 @@ def parse_query(sentences: List[str], labels: List[str], threshold: float = 0.3,
|
|
58 |
|
59 |
return results
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def present(email_file, labels, multilingual=False):
|
62 |
email = accept_mail(email_file)
|
63 |
cleaned_text = clean_email(email)
|
@@ -96,7 +107,8 @@ demo = gr.Interface(
|
|
96 |
gr.components.Textbox(label="From"),
|
97 |
gr.components.Textbox(label="To"),
|
98 |
gr.components.Textbox(label="Date"),
|
99 |
-
gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities")
|
|
|
100 |
],
|
101 |
title="Email Info Extractor",
|
102 |
description="Upload an email file (.eml) to extract its details and detected entities."
|
|
|
4 |
from bs4 import BeautifulSoup
|
5 |
from gliner import GLiNER
|
6 |
from typing import Dict, Union, List
|
7 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
8 |
|
9 |
import spacy
|
10 |
import re
|
|
|
12 |
import en_core_web_sm
|
13 |
nlp = en_core_web_sm.load()
|
14 |
|
15 |
+
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
16 |
+
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
17 |
+
|
18 |
_MODEL = {}
|
19 |
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
|
20 |
|
|
|
62 |
|
63 |
return results
|
64 |
|
65 |
+
def refine_entities_with_t5(entities):
    """Turn a list of extracted entities into one T5-generated string.

    Each entity is rendered as ``"<text> as <label>"``; the pieces are joined
    with ``" ; "`` behind a ``"refine entities: "`` prefix, and the resulting
    prompt is run through the module-level ``t5_tokenizer`` / ``t5_model``.

    Args:
        entities: Iterable of mappings, each providing ``"text"`` and
            ``"label"`` keys. ``None`` is treated like an empty iterable.

    Returns:
        The decoded text of the first generated sequence with special tokens
        stripped, or an empty string when there are no entities to refine.
    """
    # Materialise once so emptiness can be checked before touching the model.
    described = [
        f"{entity['text']} as {entity['label']}" for entity in (entities or ())
    ]
    if not described:
        # Nothing to refine: skip the model call rather than generating text
        # from a bare prompt prefix.
        return ""

    prompt = "refine entities: " + " ; ".join(described)
    input_ids = t5_tokenizer.encode(
        prompt, return_tensors="pt", add_special_tokens=True
    )
    # NOTE(review): generate() runs with the library's default generation
    # settings, so the output length is whatever those defaults allow --
    # confirm this is long enough for large entity lists.
    outputs = t5_model.generate(input_ids)
    return t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
71 |
+
|
72 |
def present(email_file, labels, multilingual=False):
|
73 |
email = accept_mail(email_file)
|
74 |
cleaned_text = clean_email(email)
|
|
|
107 |
gr.components.Textbox(label="From"),
|
108 |
gr.components.Textbox(label="To"),
|
109 |
gr.components.Textbox(label="Date"),
|
110 |
+
gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities"),
|
111 |
+
gr.components.Textbox(label="Refined Entities")
|
112 |
],
|
113 |
title="Email Info Extractor",
|
114 |
description="Upload an email file (.eml) to extract its details and detected entities."
|