|
# A model for predicting the category of news articles
|
## Usage:
|
|
|
```python
|
import re |
|
from transformers import pipeline |
|
from html import unescape |
|
from unicodedata import normalize |
|
|
|
# Matches any run of one or more whitespace characters (spaces, tabs,
# newlines, carriage returns, ...).
re_multispace = re.compile(r"\s+")


def normalize_text(text):
    """Clean up raw article text before it is passed to the classifier.

    Steps, in order:
      1. strip leading/trailing whitespace,
      2. collapse every whitespace run (including newlines, tabs and
         carriage returns) into a single space,
      3. decode HTML character references (e.g. ``&amp;`` -> ``&``),
      4. apply Unicode NFKC normalization.

    Args:
        text: The raw text, or ``None``.

    Returns:
        The normalized string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    # ``\s+`` already covers "\n", "\t" and "\r", so a single substitution
    # replaces the separate per-character replacements.
    text = re_multispace.sub(" ", text.strip())
    # Entities are decoded only after whitespace collapsing; this order is
    # kept as-is so the preprocessing output stays unchanged.
    text = unescape(text)
    return normalize("NFKC", text)
|
|
|
|
|
# Text-classification pipeline used by the example below: the
# "hynky/Category" checkpoint paired with the "ufal/robeczech-base" tokenizer.
model = pipeline(
    task="text-classification",
    model="hynky/Category",
    tokenizer="ufal/robeczech-base",
    # Truncate inputs to at most 512 tokens.
    truncation=True,
    max_length=512,
    # Request the five highest-scoring labels per input (see the pipeline docs).
    top_k=5,
)
|
|
|
|
|
def predict(article):
    """Classify a news article with the ``model`` pipeline.

    The text is first cleaned with ``normalize_text`` and the result is
    passed to ``model``.

    Args:
        article: Raw article text.

    Returns:
        Whatever ``model`` returns for the normalized text -- presumably the
        top-scoring category labels with their scores (TODO: confirm the
        exact shape against the installed transformers version).
    """
    article = normalize_text(article)
    return model(article)


# Example: print the predicted categories for a short Czech sentence.
print(predict("Dnes v noci bude pršet."))
|
```