Update app.py
Browse files
app.py
CHANGED
@@ -1,17 +1,21 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline, Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizer
|
3 |
from datasets import load_dataset
|
|
|
4 |
|
5 |
# Carica il dataset di spam detection da Hugging Face
|
6 |
dataset = load_dataset("tanquangduong/spam-detection-dataset-splits")
|
7 |
|
|
|
|
|
|
|
8 |
# Carica il tokenizer e il modello pre-addestrato
|
9 |
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
10 |
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
|
11 |
|
12 |
-
#
|
13 |
def tokenize_function(examples):
|
14 |
-
return tokenizer(examples['
|
15 |
|
16 |
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
17 |
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline, Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizer
|
3 |
from datasets import load_dataset
|
4 |
+
import torch
|
5 |
|
6 |
# Carica il dataset di spam detection da Hugging Face
|
7 |
dataset = load_dataset("tanquangduong/spam-detection-dataset-splits")
|
8 |
|
9 |
+
# Print the column names to verify which one contains the email text
|
10 |
+
print(dataset['train'].column_names)
|
11 |
+
|
12 |
# Carica il tokenizer e il modello pre-addestrato
|
13 |
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
14 |
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
|
15 |
|
16 |
+
# Tokenize using the corrected column name ('text'); truncate/pad every example to 128 tokens
|
17 |
def tokenize_function(examples):
|
18 |
+
return tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
|
19 |
|
20 |
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
21 |
|