zaidmehdi committed
Commit 00cdd45 · 1 Parent(s): 8580c38

moving get_dataset_dict_object outside of preprocessor class into utils

Files changed (2):
  1. src/model_training.py +2 -16
  2. src/utils.py +16 -1
src/model_training.py CHANGED
@@ -2,12 +2,11 @@ import numpy as np
 import pandas as pd
 import torch
 import torch.nn as nn
-from datasets import DatasetDict, Dataset
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score, f1_score
 from transformers import AutoModel, AutoTokenizer
 
-from .utils import serialize_data, load_data
+from .utils import serialize_data, load_data, get_datasetdict_object
 
 
 class PreProcessor:
@@ -18,19 +17,6 @@ class PreProcessor:
         self.df_train = pd.read_csv(train_path, sep="\t")
         self.df_test = pd.read_csv(test_path, sep="\t")
         self.output_path = output_path
-
-    def _get_datasetdict_object(self):
-        mapper = {"#2_tweet": "tweet", "#3_country_label": "label"}
-        columns_to_keep = ["tweet", "label"]
-
-        df_train = self.df_train.rename(columns=mapper)[columns_to_keep]
-        df_test = self.df_test.rename(columns=mapper)[columns_to_keep]
-
-        train_dataset = Dataset.from_pandas(df_train)
-        test_dataset = Dataset.from_pandas(df_test)
-        data = DatasetDict({'train': train_dataset, 'test': test_dataset})
-
-        return data
 
     def _tokenize(self, batch):
         return self.tokenizer(batch["tweet"], padding=True)
@@ -53,7 +39,7 @@ class PreProcessor:
         return data_hidden
 
     def preprocess_data(self):
-        data = self._get_datasetdict_object()
+        data = get_datasetdict_object(self.df_train, self.df_test)
         data_encoded = self._encode_data(data)
         data_hidden = self._get_features(data_encoded)
         serialize_data(data_hidden, output_path=self.output_path)
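From the caller's perspective the refactor is behavior-preserving: preprocess_data() still builds the DatasetDict, extracts features, and serializes them, only now via the shared helper. A minimal usage sketch, assuming the constructor takes the train_path, test_path, and output_path arguments visible in the hunk above (any model or tokenizer arguments it may also take are elided, and the file paths are hypothetical):

from src.model_training import PreProcessor

# Hypothetical TSV paths; the class reads them with sep="\t" as shown above.
preprocessor = PreProcessor(
    train_path="data/train.tsv",
    test_path="data/test.tsv",
    output_path="data/features.pkl",
)

# Builds the DatasetDict via utils.get_datasetdict_object, tokenizes the
# tweets, extracts hidden states, and serializes the result to output_path.
preprocessor.preprocess_data()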
src/utils.py CHANGED
@@ -2,8 +2,23 @@ import pickle
 
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.metrics import confusion_matrix
 import torch
+from datasets import DatasetDict, Dataset
+from sklearn.metrics import confusion_matrix
+
+
+def get_datasetdict_object(df_train, df_test):
+    mapper = {"#2_tweet": "tweet", "#3_country_label": "label"}
+    columns_to_keep = ["tweet", "label"]
+
+    df_train = df_train.rename(columns=mapper)[columns_to_keep]
+    df_test = df_test.rename(columns=mapper)[columns_to_keep]
+
+    train_dataset = Dataset.from_pandas(df_train)
+    test_dataset = Dataset.from_pandas(df_test)
+    data = DatasetDict({'train': train_dataset, 'test': test_dataset})
+
+    return data
 
 
 def extract_hidden_state(input_text, tokenizer, language_model):
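Because get_datasetdict_object is now module-level rather than a private method, it can be reused and tested without constructing a PreProcessor. A quick sanity check with made-up rows (only the "#2_tweet"/"#3_country_label" column names are required by the mapper; all values here are invented):

import pandas as pd

from src.utils import get_datasetdict_object

df_train = pd.DataFrame({"#2_tweet": ["tweet a", "tweet b"],
                         "#3_country_label": ["Egypt", "Morocco"]})
df_test = pd.DataFrame({"#2_tweet": ["tweet c"],
                        "#3_country_label": ["Algeria"]})

data = get_datasetdict_object(df_train, df_test)
print(data["train"].column_names)  # ['tweet', 'label']
print(data["test"].num_rows)       # 1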