Update update_embeddings.py
Browse files- update_embeddings.py +26 -11
update_embeddings.py
CHANGED
@@ -11,7 +11,6 @@ from sentence_transformers import SentenceTransformer # For embedding the text
|
|
11 |
import torch # For gpu
|
12 |
import pandas as pd # Data manipulation
|
13 |
from huggingface_hub import snapshot_download # Download previous embeddings
|
14 |
-
import json # To make milvus compatible $meta
|
15 |
import os # Folder and file creation
|
16 |
from tqdm import tqdm # Progress bar
|
17 |
tqdm.pandas() # Progress bar for pandas
|
@@ -45,6 +44,14 @@ UPLOAD = True
|
|
45 |
# Flag to binarise the data
|
46 |
BINARY = True
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
########################################
|
49 |
|
50 |
# Model to use for embedding
|
@@ -241,36 +248,44 @@ print(f"Creating new embeddings for: {num_new_papers} entries")
|
|
241 |
new_papers["vector"] = new_papers["abstract"].progress_apply(embed)
|
242 |
|
243 |
####################
|
|
|
|
|
244 |
# Add URL column
|
245 |
-
|
246 |
|
247 |
# Add month column
|
248 |
-
|
249 |
|
250 |
####################
|
|
|
|
|
251 |
# Remove newline characters from authors, title, abstract and categories columns
|
252 |
-
|
253 |
|
254 |
-
|
255 |
|
256 |
-
|
257 |
|
258 |
-
|
259 |
|
260 |
####################
|
|
|
|
|
261 |
# Trim title to 512 characters
|
262 |
-
|
263 |
|
264 |
# Trim categories to 128 characters
|
265 |
-
|
266 |
|
267 |
# Trim authors to 128 characters
|
268 |
-
|
269 |
|
270 |
# Trim abstract to 3072 characters
|
271 |
-
|
272 |
|
273 |
####################
|
|
|
|
|
274 |
# Selecting id, vector and $meta to retain
|
275 |
selected_columns = ['id', 'vector', 'title', 'abstract', 'authors', 'categories', 'month', 'year', 'url']
|
276 |
|
|
|
11 |
import torch # For gpu
|
12 |
import pandas as pd # Data manipulation
|
13 |
from huggingface_hub import snapshot_download # Download previous embeddings
|
|
|
14 |
import os # Folder and file creation
|
15 |
from tqdm import tqdm # Progress bar
|
16 |
tqdm.pandas() # Progress bar for pandas
|
|
|
44 |
# Flag to binarise the data
|
45 |
BINARY = True
|
46 |
|
47 |
+
# Print the configuration
|
48 |
+
print('Configuration:')  # plain literal: no placeholders, so no f-string prefix needed
|
49 |
+
print(f'Year: {year}')
|
50 |
+
print(f'Force: {FORCE}')
|
51 |
+
print(f'Local: {LOCAL}')
|
52 |
+
print(f'Upload: {UPLOAD}')
|
53 |
+
print(f'Binary: {BINARY}')
|
54 |
+
|
55 |
########################################
|
56 |
|
57 |
# Model to use for embedding
|
|
|
248 |
new_papers["vector"] = new_papers["abstract"].progress_apply(embed)
|
249 |
|
250 |
####################
|
251 |
+
print("Adding url and month columns")
|
252 |
+
|
253 |
# Add URL column
|
254 |
+
new_papers['url'] = 'https://arxiv.org/abs/' + new_papers['id']
|
255 |
|
256 |
# Add month column
|
257 |
+
new_papers['month'] = new_papers['id'].progress_apply(extract_month_year, what='month')
|
258 |
|
259 |
####################
|
260 |
+
print("Removing newline characters from title, authors, categories, abstract")
|
261 |
+
|
262 |
# Remove newline characters from authors, title, abstract and categories columns
|
263 |
+
new_papers['title'] = new_papers['title'].astype(str).str.replace('\n', ' ', regex=False)
|
264 |
|
265 |
+
new_papers['authors'] = new_papers['authors'].astype(str).str.replace('\n', ' ', regex=False)
|
266 |
|
267 |
+
new_papers['categories'] = new_papers['categories'].astype(str).str.replace('\n', ' ', regex=False)
|
268 |
|
269 |
+
new_papers['abstract'] = new_papers['abstract'].astype(str).str.replace('\n', ' ', regex=False)
|
270 |
|
271 |
####################
|
272 |
+
print("Trimming title, authors, categories, abstract")
|
273 |
+
|
274 |
# Trim title to 512 characters
|
275 |
+
new_papers['title'] = new_papers['title'].progress_apply(lambda x: x[:508] + '...' if len(x) > 512 else x)
|
276 |
|
277 |
# Trim categories to 128 characters
|
278 |
+
new_papers['categories'] = new_papers['categories'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x)
|
279 |
|
280 |
# Trim authors to 128 characters
|
281 |
+
new_papers['authors'] = new_papers['authors'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x)
|
282 |
|
283 |
# Trim abstract to 3072 characters
|
284 |
+
new_papers['abstract'] = new_papers['abstract'].progress_apply(lambda x: x[:3068] + '...' if len(x) > 3072 else x)
|
285 |
|
286 |
####################
|
287 |
+
# Status message for the concatenation step (typo "previouly" corrected)
print("Concatenating previously embedded dataframe with new embeddings")
|
288 |
+
|
289 |
# Selecting id, vector and $meta to retain
|
290 |
selected_columns = ['id', 'vector', 'title', 'abstract', 'authors', 'categories', 'month', 'year', 'url']
|
291 |
|