bluuebunny committed on
Commit
85dfa22
·
verified ·
1 Parent(s): c5b8946

Update update_embeddings.py

Browse files
Files changed (1) hide show
  1. update_embeddings.py +26 -11
update_embeddings.py CHANGED
@@ -11,7 +11,6 @@ from sentence_transformers import SentenceTransformer # For embedding the text
11
  import torch # For gpu
12
  import pandas as pd # Data manipulation
13
  from huggingface_hub import snapshot_download # Download previous embeddings
14
- import json # To make milvus compatible $meta
15
  import os # Folder and file creation
16
  from tqdm import tqdm # Progress bar
17
  tqdm.pandas() # Progress bar for pandas
@@ -45,6 +44,14 @@ UPLOAD = True
45
  # Flag to binarise the data
46
  BINARY = True
47
 
 
 
 
 
 
 
 
 
48
  ########################################
49
 
50
  # Model to use for embedding
@@ -241,36 +248,44 @@ print(f"Creating new embeddings for: {num_new_papers} entries")
241
  new_papers["vector"] = new_papers["abstract"].progress_apply(embed)
242
 
243
  ####################
 
 
244
  # Add URL column
245
- arxiv_metadata_split['url'] = 'https://arxiv.org/abs/' + arxiv_metadata_split['id']
246
 
247
  # Add month column
248
- arxiv_metadata_split['month'] = arxiv_metadata_split['id'].progress_apply(extract_month_year, what='month')
249
 
250
  ####################
 
 
251
  # Remove newline characters from authors, title, abstract and categories columns
252
- arxiv_metadata_split['title'] = arxiv_metadata_split['title'].astype(str).str.replace('\n', ' ', regex=False)
253
 
254
- arxiv_metadata_split['authors'] = arxiv_metadata_split['authors'].astype(str).str.replace('\n', ' ', regex=False)
255
 
256
- arxiv_metadata_split['categories'] = arxiv_metadata_split['categories'].astype(str).str.replace('\n', ' ', regex=False)
257
 
258
- arxiv_metadata_split['abstract'] = arxiv_metadata_split['abstract'].astype(str).str.replace('\n', ' ', regex=False)
259
 
260
  ####################
 
 
261
  # Trim title to 512 characters
262
- arxiv_metadata_split['title'] = arxiv_metadata_split['title'].progress_apply(lambda x: x[:508] + '...' if len(x) > 512 else x)
263
 
264
  # Trim categories to 128 characters
265
- arxiv_metadata_split['categories'] = arxiv_metadata_split['categories'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x)
266
 
267
  # Trim authors to 128 characters
268
- arxiv_metadata_split['authors'] = arxiv_metadata_split['authors'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x)
269
 
270
  # Trim abstract to 3072 characters
271
- arxiv_metadata_split['abstract'] = arxiv_metadata_split['abstract'].progress_apply(lambda x: x[:3068] + '...' if len(x) > 3072 else x)
272
 
273
  ####################
 
 
274
  # Selecting id, vector and $meta to retain
275
  selected_columns = ['id', 'vector', 'title', 'abstract', 'authors', 'categories', 'month', 'year', 'url']
276
 
 
11
  import torch # For gpu
12
  import pandas as pd # Data manipulation
13
  from huggingface_hub import snapshot_download # Download previous embeddings
 
14
  import os # Folder and file creation
15
  from tqdm import tqdm # Progress bar
16
  tqdm.pandas() # Progress bar for pandas
 
44
  # Flag to binarise the data
45
  BINARY = True
46
 
47
+ # Print the configuration
48
+ print(f'Configuration:')
49
+ print(f'Year: {year}')
50
+ print(f'Force: {FORCE}')
51
+ print(f'Local: {LOCAL}')
52
+ print(f'Upload: {UPLOAD}')
53
+ print(f'Binary: {BINARY}')
54
+
55
  ########################################
56
 
57
  # Model to use for embedding
 
248
  new_papers["vector"] = new_papers["abstract"].progress_apply(embed)
249
 
250
  ####################
251
+ print("Adding url and month columns")
252
+
253
  # Add URL column
254
+ new_papers['url'] = 'https://arxiv.org/abs/' + new_papers['id']
255
 
256
  # Add month column
257
+ new_papers['month'] = new_papers['id'].progress_apply(extract_month_year, what='month')
258
 
259
  ####################
260
+ print("Removing newline characters from title, authors, categories, abstract")
261
+
262
  # Remove newline characters from authors, title, abstract and categories columns
263
+ new_papers['title'] = new_papers['title'].astype(str).str.replace('\n', ' ', regex=False)
264
 
265
+ new_papers['authors'] = new_papers['authors'].astype(str).str.replace('\n', ' ', regex=False)
266
 
267
+ new_papers['categories'] = new_papers['categories'].astype(str).str.replace('\n', ' ', regex=False)
268
 
269
+ new_papers['abstract'] = new_papers['abstract'].astype(str).str.replace('\n', ' ', regex=False)
270
 
271
  ####################
272
+ print("Trimming title, authors, categories, abstract")
273
+
274
  # Trim title to 512 characters
275
+ new_papers['title'] = new_papers['title'].progress_apply(lambda x: x[:508] + '...' if len(x) > 512 else x)
276
 
277
  # Trim categories to 128 characters
278
+ new_papers['categories'] = new_papers['categories'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x)
279
 
280
  # Trim authors to 128 characters
281
+ new_papers['authors'] = new_papers['authors'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x)
282
 
283
  # Trim abstract to 3072 characters
284
+ new_papers['abstract'] = new_papers['abstract'].progress_apply(lambda x: x[:3068] + '...' if len(x) > 3072 else x)
285
 
286
  ####################
287
+ print("Concatenating previously embedded dataframe with new embeddings")
288
+
289
  # Selecting id, vector and $meta to retain
290
  selected_columns = ['id', 'vector', 'title', 'abstract', 'authors', 'categories', 'month', 'year', 'url']
291