moving get_datasetdict_object outside of the PreProcessor class into utils

Files changed:
- src/model_training.py  +2 -16
- src/utils.py  +16 -1

src/model_training.py

@@ -2,12 +2,11 @@ import numpy as np
 import pandas as pd
 import torch
 import torch.nn as nn
-from datasets import DatasetDict, Dataset
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score, f1_score
 from transformers import AutoModel, AutoTokenizer
 
-from .utils import serialize_data, load_data
+from .utils import serialize_data, load_data, get_datasetdict_object
 
 
 class PreProcessor:
@@ -18,19 +17,6 @@ class PreProcessor:
         self.df_train = pd.read_csv(train_path, sep="\t")
         self.df_test = pd.read_csv(test_path, sep="\t")
         self.output_path = output_path
-
-    def _get_datasetdict_object(self):
-        mapper = {"#2_tweet": "tweet", "#3_country_label": "label"}
-        columns_to_keep = ["tweet", "label"]
-
-        df_train = self.df_train.rename(columns=mapper)[columns_to_keep]
-        df_test = self.df_test.rename(columns=mapper)[columns_to_keep]
-
-        train_dataset = Dataset.from_pandas(df_train)
-        test_dataset = Dataset.from_pandas(df_test)
-        data = DatasetDict({'train': train_dataset, 'test': test_dataset})
-
-        return data
 
     def _tokenize(self, batch):
         return self.tokenizer(batch["tweet"], padding=True)
@@ -53,7 +39,7 @@ class PreProcessor:
         return data_hidden
 
     def preprocess_data(self):
-        data = self._get_datasetdict_object()
+        data = get_datasetdict_object(self.df_train, self.df_test)
         data_encoded = self._encode_data(data)
         data_hidden = self._get_features(data_encoded)
         serialize_data(data_hidden, output_path=self.output_path)
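
With the helper moved out, preprocess_data builds the DatasetDict through the shared utility instead of a private method. A minimal usage sketch, assuming the PreProcessor constructor takes the train_path, test_path, and output_path arguments visible in the hunks above (any further parameters, e.g. a tokenizer or model name, are not shown in this diff; the file paths are placeholders):

# Hypothetical driver; constructor signature inferred from the visible hunks only.
from src.model_training import PreProcessor

preprocessor = PreProcessor(
    train_path="data/train.tsv",      # read with sep="\t" in the constructor
    test_path="data/test.tsv",
    output_path="data/features.pkl",  # passed through to serialize_data
)
# DatasetDict -> tokenize -> hidden states -> serialized features
preprocessor.preprocess_data()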

src/utils.py

@@ -2,8 +2,23 @@ import pickle
 
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.metrics import confusion_matrix
 import torch
+from datasets import DatasetDict, Dataset
+from sklearn.metrics import confusion_matrix
+
+
+def get_datasetdict_object(df_train, df_test):
+    mapper = {"#2_tweet": "tweet", "#3_country_label": "label"}
+    columns_to_keep = ["tweet", "label"]
+
+    df_train = df_train.rename(columns=mapper)[columns_to_keep]
+    df_test = df_test.rename(columns=mapper)[columns_to_keep]
+
+    train_dataset = Dataset.from_pandas(df_train)
+    test_dataset = Dataset.from_pandas(df_test)
+    data = DatasetDict({'train': train_dataset, 'test': test_dataset})
+
+    return data
 
 
 def extract_hidden_state(input_text, tokenizer, language_model):
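
Now that get_datasetdict_object is a free function in src/utils.py, it can be exercised without constructing a PreProcessor. A minimal sketch, assuming the raw column names that mapper expects ("#2_tweet", "#3_country_label") and that src is importable as a package; the rows and labels are illustrative placeholders:

# Standalone check of the relocated helper; input rows are placeholders.
import pandas as pd
from src.utils import get_datasetdict_object

df_train = pd.DataFrame({
    "#2_tweet": ["example tweet 1", "example tweet 2"],
    "#3_country_label": ["Egypt", "Morocco"],
})
df_test = pd.DataFrame({
    "#2_tweet": ["example tweet 3"],
    "#3_country_label": ["Sudan"],
})

data = get_datasetdict_object(df_train, df_test)
print(data["train"].column_names)  # ['tweet', 'label']
print(data["test"].num_rows)       # 1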