oliverguhr
committed on
Commit
·
c5c8dd0
1
Parent(s):
d61f2ce
removed sample as it is more confusing than helpful
Browse files
README.md
CHANGED
@@ -48,56 +48,6 @@ The code above will output following list:
|
|
48 |
["negative","negative","positive","positive","neutral", "neutral"]
|
49 |
```
|
50 |
|
51 |
-
## A minimal working Sample
|
52 |
-
|
53 |
-
|
54 |
-
```python
|
55 |
-
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
56 |
-
from typing import List
|
57 |
-
import torch
|
58 |
-
import re
|
59 |
-
|
60 |
-
class SentimentModel():
|
61 |
-
def __init__(self, model_name: str):
|
62 |
-
self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
63 |
-
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
64 |
-
|
65 |
-
self.clean_chars = re.compile(r'[^A-Za-züöäÖÜÄß ]', re.MULTILINE)
|
66 |
-
self.clean_http_urls = re.compile(r'https*\\S+', re.MULTILINE)
|
67 |
-
self.clean_at_mentions = re.compile(r'@\\S+', re.MULTILINE)
|
68 |
-
|
69 |
-
def predict_sentiment(self, texts: List[str])-> List[str]:
|
70 |
-
texts = [self.clean_text(text) for text in texts]
|
71 |
-
# add_special_tokens=True takes care of adding the [CLS], [SEP], <s>... tokens in the right way for each model.
|
72 |
-
encoded = self.tokenizer.batch_encode_plus(texts,padding=True, add_special_tokens=True,truncation=True, return_tensors="pt")
|
73 |
-
encoded = encoded.to(self.device)
|
74 |
-
with torch.no_grad():
|
75 |
-
logits = self.model(**encoded)
|
76 |
-
|
77 |
-
label_ids = torch.argmax(logits[0], axis=1)
|
78 |
-
return [self.model.config.id2label[label_id.item()] for label_id in label_ids]
|
79 |
-
|
80 |
-
def replace_numbers(self,text: str) -> str:
|
81 |
-
return text.replace("0"," null").replace("1"," eins").replace("2"," zwei").replace("3"," drei").replace("4"," vier").replace("5"," fünf").replace("6"," sechs").replace("7"," sieben").replace("8"," acht").replace("9"," neun")
|
82 |
-
|
83 |
-
def clean_text(self,text: str)-> str:
|
84 |
-
text = text.replace("\n", " ")
|
85 |
-
text = self.clean_http_urls.sub('',text)
|
86 |
-
text = self.clean_at_mentions.sub('',text)
|
87 |
-
text = self.replace_numbers(text)
|
88 |
-
text = self.clean_chars.sub('', text) # use only text chars
|
89 |
-
text = ' '.join(text.split()) # substitute multiple whitespace with single whitespace
|
90 |
-
text = text.strip().lower()
|
91 |
-
return text
|
92 |
-
|
93 |
-
texts = ["Mit keinem guten Ergebniss","Das war unfair", "Das ist gar nicht mal so gut",
|
94 |
-
"Total awesome!","nicht so schlecht wie erwartet", "Das ist gar nicht mal so schlecht",
|
95 |
-
"Der Test verlief positiv.","Sie fährt ein grünes Auto.", "Der Fall wurde an die Polzei übergeben."]
|
96 |
-
|
97 |
-
model = SentimentModel(model_name = "oliverguhr/german-sentiment-bert")
|
98 |
-
|
99 |
-
print(model.predict_sentiment(texts))
|
100 |
-
```
|
101 |
|
102 |
## Model and Data
|
103 |
|
|
|
48 |
["negative","negative","positive","positive","neutral", "neutral"]
|
49 |
```
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
## Model and Data
|
53 |
|