akhil-vaidya committed on
Commit
81ebdf3
·
1 Parent(s): c161b3b

feat: auto-label-embed-cluster

Browse files
data/MMR_CLEAN_EMBEDDINGS.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/MMR_DATA_CLEAN_LABELLED.csv ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/TFIDF.ipynb CHANGED
@@ -18,7 +18,7 @@
18
  "import numpy as np\n",
19
  "import pandas as pd\n",
20
  "import matplotlib.pyplot as plt\n",
21
- "import seaborn as sns\n",
22
  "import os\n",
23
  "\n",
24
  "import re\n",
 
18
  "import numpy as np\n",
19
  "import pandas as pd\n",
20
  "import matplotlib.pyplot as plt\n",
21
+ "# import seaborn as sns\n",
22
  "import os\n",
23
  "\n",
24
  "import re\n",
notebooks/USE_embedding.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ matplotlib
4
+ seaborn
5
+ scikit-learn
6
+ umap-learn
7
+ nltk
8
+ tensorflow-hub
src/main.py CHANGED
@@ -1,9 +1,10 @@
1
  ######################################## IMPORTING REQUIRED LIBRARIES ####################################
2
  import os
3
  import sys
 
4
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
5
  data_folder = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
6
- from utilities import get_data, input_filter, clean_data
7
 
8
 
9
  ################################################## INPUTS ################################################
@@ -26,8 +27,15 @@ def data_clean_for_training(df):
26
 
27
  if __name__ == '__main__':
28
 
29
- df = data_sourcing() ## testing the data sourcing endpoint
30
- if df:
31
- print("Data loaded successfully !!")
32
 
33
- clean_df = data_clean_for_training(df)
 
 
 
 
 
 
 
 
1
  ######################################## IMPORTING REQUIRED LIBRARIES ####################################
2
  import os
3
  import sys
4
+ import pandas as pd
5
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6
  data_folder = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
7
+ from utilities import get_data, input_filter, clean_data, autogenerate_labels
8
 
9
 
10
  ################################################## INPUTS ################################################
 
27
 
if __name__ == '__main__':
    # NOTE: the data_sourcing() / data_clean_for_training() path is currently
    # disabled; the script reads the raw CSV from the data folder instead.
    df = clean_data(pd.read_csv(f'{data_folder}/MMR_DATA.csv'))
    labelled_df, embeddings_df = autogenerate_labels(df)

    # Write both artefacts back into the data folder, omitting the index column.
    outputs = (
        (labelled_df, f'{data_folder}/MMR_DATA_CLEAN_LABELLED.csv'),
        (embeddings_df, f'{data_folder}/MMR_CLEAN_EMBEDDINGS.csv'),
    )
    for frame, csv_path in outputs:
        frame.to_csv(csv_path, index=False)
utilities/__init__.py CHANGED
@@ -1,2 +1,3 @@
1
  from .data_loader import get_data, input_filter
2
- from .data_cleaner import clean_data
 
 
1
  from .data_loader import get_data, input_filter
2
+ from .data_cleaner import clean_data
3
+ from .cluster_label import autogenerate_labels
utilities/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/utilities/__pycache__/__init__.cpython-312.pyc and b/utilities/__pycache__/__init__.cpython-312.pyc differ
 
utilities/__pycache__/cluster_label.cpython-312.pyc ADDED
Binary file (1.76 kB). View file
 
utilities/__pycache__/data_cleaner.cpython-312.pyc ADDED
Binary file (1.79 kB). View file
 
utilities/cluster_label.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as numpy
2
+ import pandas as pd
3
+ import numpy as np
4
+ import tensorflow_hub as hub
5
+ from sklearn.cluster import KMeans
6
+ from sklearn.preprocessing import StandardScaler
7
+
8
+
_USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
_use_model = None  # lazily populated by _get_use_model()


def _get_use_model():
    """Return the Universal Sentence Encoder, loading it on first use only."""
    global _use_model
    if _use_model is None:
        # hub.load fetches/deserialises the whole module, so do it once per
        # process instead of once per embed() call.
        _use_model = hub.load(_USE_MODULE_URL)
    return _use_model


def embed(input):  # noqa: A002 - parameter name kept for caller compatibility
    """Embed text with the Universal Sentence Encoder (v4) from TF Hub.

    Args:
        input: The texts to embed; passed to the loaded model unchanged.

    Returns:
        Whatever the loaded model returns for ``input`` (callers in this
        module convert it with ``np.array``).
    """
    return _get_use_model()(input)
13
+
def generate_use_embeddings(data):
    """Embed ``data`` via ``embed`` and return the result as nested lists.

    Args:
        data: The texts to embed; forwarded to ``embed`` unchanged.

    Returns:
        The embeddings converted to a NumPy array and then to plain Python
        lists via ``tolist()``.
    """
    raw_vectors = embed(data)
    return np.array(raw_vectors).tolist()
18
+
def autogenerate_labels(df, n_clusters=4, random_state=42):
    """Cluster the 'Map Data' texts and attach the cluster id as a label.

    The texts are embedded with ``generate_use_embeddings``, the embeddings
    are standardised with ``StandardScaler`` and clustered with ``KMeans``.

    Args:
        df: DataFrame with a 'Map Data' column holding the texts.
        n_clusters: Number of KMeans clusters (defaults to the previous
            hard-coded value of 4).
        random_state: Seed passed to KMeans (defaults to the previous
            hard-coded value of 42).

    Returns:
        A tuple ``(labelled_df, df_embeddings)``:
        * ``labelled_df`` - a new DataFrame equal to ``df`` plus a 'label'
          column with 1-based cluster ids. ``df`` itself is not modified, so
          passing a filtered slice no longer triggers SettingWithCopyWarning.
        * ``df_embeddings`` - the raw (unscaled) embeddings, one row per input
          row, with a default integer index (not aligned to ``df.index``).
    """
    texts = df['Map Data'].to_numpy()

    np_embeddings = np.array(generate_use_embeddings(texts))
    df_embeddings = pd.DataFrame(np_embeddings)

    # Cluster on standardised features; the returned embeddings stay unscaled.
    scaled_embeddings = StandardScaler().fit_transform(np_embeddings)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(scaled_embeddings)

    # KMeans labels are 0-based; shift so labels start at 1.
    # assign() returns a new frame instead of writing into the caller's df.
    labelled_df = df.assign(label=kmeans.labels_ + 1)
    return labelled_df, df_embeddings
utilities/data_cleaner.py CHANGED
@@ -22,6 +22,6 @@ def clean_text(text):
22
  def clean_data(df):
23
  df['Map Data'] = df['Map Data'].fillna('')
24
  df = df[df['Map Data'].str.len() > 0]
25
- df = df[df['Map Data'].str.len() < 5000]
26
  # df['Map Data'] = df['Map Data'].apply(clean_text)
27
  return df
 
def clean_data(df, max_length=10000):
    """Drop rows whose 'Map Data' text is missing, empty, or too long.

    Args:
        df: DataFrame with a 'Map Data' column. It is not modified.
        max_length: Exclusive upper bound on the text length; rows with
            ``len(text) >= max_length`` are dropped. Defaults to 10000.

    Returns:
        A new, independent DataFrame containing only the rows whose
        'Map Data' length is > 0 and < ``max_length``. The original index
        values of the kept rows are preserved.
    """
    # Treat missing values as empty strings so they fail the length check.
    text = df['Map Data'].fillna('')
    lengths = text.str.len()
    keep = (lengths > 0) & (lengths < max_length)

    # Copy so later column assignments on the result neither touch the
    # caller's frame nor raise SettingWithCopyWarning.
    cleaned = df.loc[keep].copy()
    cleaned['Map Data'] = text.loc[keep]
    # NOTE: per-row clean_text() normalisation is currently disabled.
    return cleaned