Commit 81ebdf3
Parent(s): c161b3b
feat: auto-label-embed-cluster
Files changed:
- data/MMR_CLEAN_EMBEDDINGS.csv +0 -0
- data/MMR_DATA_CLEAN_LABELLED.csv +0 -0
- notebooks/TFIDF.ipynb +1 -1
- notebooks/USE_embedding.ipynb +0 -0
- requirements.txt +8 -0
- src/main.py +13 -5
- utilities/__init__.py +2 -1
- utilities/__pycache__/__init__.cpython-312.pyc +0 -0
- utilities/__pycache__/cluster_label.cpython-312.pyc +0 -0
- utilities/__pycache__/data_cleaner.cpython-312.pyc +0 -0
- utilities/cluster_label.py +35 -0
- utilities/data_cleaner.py +1 -1
data/MMR_CLEAN_EMBEDDINGS.csv
ADDED
The diff for this file is too large to render.

data/MMR_DATA_CLEAN_LABELLED.csv
ADDED
The diff for this file is too large to render.

notebooks/TFIDF.ipynb
CHANGED
@@ -18,7 +18,7 @@
 "import numpy as np\n",
 "import pandas as pd\n",
 "import matplotlib.pyplot as plt\n",
-"import seaborn as sns\n",
+"# import seaborn as sns\n",
 "import os\n",
 "\n",
 "import re\n",
notebooks/USE_embedding.ipynb
ADDED
The diff for this file is too large to render.
requirements.txt
CHANGED
@@ -0,0 +1,8 @@
+numpy
+pandas
+matplotlib
+seaborn
+scikit-learn
+umap-learn
+nltk
+tensorflow-hub
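
requirements.txt now lists the libraries the new pipeline imports (numpy, pandas, matplotlib, seaborn, scikit-learn, umap-learn, nltk, tensorflow-hub). Below is a minimal sketch for checking that they all resolve in the current environment; the mapping from PyPI names to import names (scikit-learn -> sklearn, umap-learn -> umap, tensorflow-hub -> tensorflow_hub) is supplied here and is not part of the file:

# Sketch: verify the newly listed dependencies import cleanly.
import importlib

for module in ('numpy', 'pandas', 'matplotlib', 'seaborn',
               'sklearn', 'umap', 'nltk', 'tensorflow_hub'):
    try:
        importlib.import_module(module)
        print(f'{module}: OK')
    except ImportError as exc:
        print(f'{module}: missing ({exc})')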
src/main.py
CHANGED
@@ -1,9 +1,10 @@
 ######################################## IMPORTING REQUIRED LIBRARIES ####################################
 import os
 import sys
+import pandas as pd
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 data_folder = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
-from utilities import get_data, input_filter, clean_data
+from utilities import get_data, input_filter, clean_data, autogenerate_labels
 
 
 ################################################## INPUTS ################################################
@@ -26,8 +27,15 @@ def data_clean_for_training(df):
 
 if __name__ == '__main__':
 
-    df = data_sourcing() ## testing the data sourcing endpoint
-    if df:
-        print("Data loaded successfully !!")
+    # df = data_sourcing() ## testing the data sourcing endpoint
+    # if df:
+    #     print("Data loaded successfully !!")
 
-    clean_df = data_clean_for_training(df)
+    # clean_df = data_clean_for_training(df)
+
+    df = pd.read_csv(f'{data_folder}/MMR_DATA.csv')
+    df = clean_data(df)
+    labelled_df, embeddings_df = autogenerate_labels(df)
+
+    labelled_df.to_csv(f'{data_folder}/MMR_DATA_CLEAN_LABELLED.csv', index=False)
+    embeddings_df.to_csv(f'{data_folder}/MMR_CLEAN_EMBEDDINGS.csv', index=False)
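
The rewritten __main__ block reads MMR_DATA.csv from the data folder, cleans it with clean_data, derives cluster labels and embeddings via autogenerate_labels, and writes both results to CSV (the two large files added in this commit). A minimal sketch for inspecting those artifacts after a run; the relative data path and the distribution check are assumptions, while the file names and the label column come from this commit:

# Sketch: sanity-check the CSVs written by src/main.py (assumes the script has already run).
import os
import pandas as pd

data_folder = 'data'  # assumed to be the repo's data directory

labelled = pd.read_csv(os.path.join(data_folder, 'MMR_DATA_CLEAN_LABELLED.csv'))
embeddings = pd.read_csv(os.path.join(data_folder, 'MMR_CLEAN_EMBEDDINGS.csv'))

print(labelled.shape, embeddings.shape)               # one embedding row per labelled row
print(labelled['label'].value_counts().sort_index())  # records per auto-generated cluster (1-4)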
utilities/__init__.py
CHANGED
@@ -1,2 +1,3 @@
 from .data_loader import get_data, input_filter
-from .data_cleaner import clean_data
+from .data_cleaner import clean_data
+from .cluster_label import autogenerate_labels
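
Because autogenerate_labels is re-exported from the package root, callers outside src/main.py can reach the whole pipeline through a single utilities import. A short usage sketch, assuming the same input CSV that main.py reads:

import pandas as pd
from utilities import clean_data, autogenerate_labels

df = pd.read_csv('data/MMR_DATA.csv')   # same source file src/main.py loads
labelled_df, embeddings_df = autogenerate_labels(clean_data(df))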
utilities/__pycache__/__init__.cpython-312.pyc
CHANGED
Binary files a/utilities/__pycache__/__init__.cpython-312.pyc and b/utilities/__pycache__/__init__.cpython-312.pyc differ

utilities/__pycache__/cluster_label.cpython-312.pyc
ADDED
Binary file (1.76 kB).

utilities/__pycache__/data_cleaner.cpython-312.pyc
ADDED
Binary file (1.79 kB).
utilities/cluster_label.py
ADDED
@@ -0,0 +1,35 @@
+import numpy as numpy
+import pandas as pd
+import numpy as np
+import tensorflow_hub as hub
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
+
+
+def embed(input):
+    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
+    model = hub.load(module_url)
+    return model(input)
+
+def generate_use_embeddings(data):
+    embeddings = embed(data)
+    embeddings = np.array(embeddings).tolist()
+    return embeddings
+
+def autogenerate_labels(df):
+    map_data = df['Map Data'].to_numpy()
+
+    embeddings_list = generate_use_embeddings(map_data)
+    np_embeddings = np.array(embeddings_list)
+    df_embeddings = pd.DataFrame(np_embeddings)
+    scaler = StandardScaler()
+    scaled_embeddings = scaler.fit_transform(np_embeddings)
+
+    n_clusters = 4
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+    kmeans.fit(scaled_embeddings)
+
+    y_kmeans = kmeans.labels_
+
+    df['label'] = y_kmeans + 1
+    return df, df_embeddings
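
cluster_label.py embeds the 'Map Data' text with the Universal Sentence Encoder from TF Hub (512-dimensional vectors), standardizes the embeddings, and assigns each record a KMeans label in the range 1-4. The choice of n_clusters = 4 is hard-coded; a minimal sketch of how that value could be sanity-checked with silhouette scores, assuming scaled_embeddings is the standardized matrix built inside autogenerate_labels:

# Sketch: compare candidate cluster counts with silhouette scores
# (scaled_embeddings stands in for the standardized USE matrix above).
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_n_clusters(scaled_embeddings, candidates=(2, 3, 4, 5, 6)):
    scores = {}
    for k in candidates:
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(scaled_embeddings)
        scores[k] = silhouette_score(scaled_embeddings, labels)
    return max(scores, key=scores.get), scores

Separately, embed() reloads the TF Hub module on every call; loading the model once at module level would avoid repeated downloads if the function is used more than once.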
utilities/data_cleaner.py
CHANGED
@@ -22,6 +22,6 @@ def clean_text(text):
 def clean_data(df):
     df['Map Data'] = df['Map Data'].fillna('')
     df = df[df['Map Data'].str.len() > 0]
-    df = df[df['Map Data'].str.len() <
+    df = df[df['Map Data'].str.len() < 10000]
     # df['Map Data'] = df['Map Data'].apply(clean_text)
     return df
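
With this change, clean_data keeps only non-empty 'Map Data' entries shorter than 10,000 characters. A toy example of the filter's effect (the sample rows are made up):

import pandas as pd

sample = pd.DataFrame({'Map Data': ['short entry', '', 'x' * 20000]})
sample['Map Data'] = sample['Map Data'].fillna('')
sample = sample[sample['Map Data'].str.len() > 0]
sample = sample[sample['Map Data'].str.len() < 10000]
print(len(sample))  # 1 -- only the short, non-empty entry survives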