Update update_embeddings.py
update_embeddings.py  (+80, −25)  CHANGED
@@ -16,11 +16,13 @@ import os # Folder and file creation
 from tqdm import tqdm # Progress bar
 tqdm.pandas() # Progress bar for pandas
 from mixedbread_ai.client import MixedbreadAI # For embedding the text
+from dotenv import dotenv_values # To load environment variables
 import numpy as np # For array manipulation
 from huggingface_hub import HfApi # To transact with huggingface.co
 import sys # To quit the script
 import datetime # get current year
 from time import time, sleep # To time the script
+from datetime import datetime # To get the current date and time

 # Start timer
 start = time()
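Worth noting when reading this hunk: the file now contains both import datetime and from datetime import datetime, and the second binding wins, so later code has to call the class directly (the new extract_month_year helper below does exactly that with datetime.strptime). A minimal illustration of the shadowing:

import datetime
from datetime import datetime

datetime.strptime('2401', '%y%m')   # fine: the name datetime now refers to the class
# datetime.datetime.now()           # would raise AttributeError after the second import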
@@ -57,6 +59,12 @@ num_cores = cpu_count()-1
 # Setup transaction details
 repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"

+# Import secrets
+config = dotenv_values(".env")
+
+def is_running_in_huggingface_space():
+    return "SPACE_ID" in os.environ
+
 ################################################################################
 # Download the dataset

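For context, a minimal sketch of the .env file that dotenv_values(".env") is expected to find when the script runs outside a Space; the key names come from later hunks, the values are placeholders:

# .env (kept next to the script, out of version control)
MXBAI_API_KEY=your-mixedbread-api-key
HF_API_KEY=your-huggingface-write-token

dotenv_values returns a plain dict and does not touch os.environ, while Hugging Face Spaces expose their secrets as environment variables and set SPACE_ID, which is what is_running_in_huggingface_space checks to decide between the two sources.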
@@ -90,28 +98,35 @@ else:
 # https://huggingface.co/docs/datasets/en/about_arrow#memory-mapping
 # Load metadata
 print(f"Loading json metadata")
-########################################
-# Function to add year to metadata
-def add_year(example):
+dataset = load_dataset("json", data_files= str(f"{download_file}"))

+# Split metadata by year
+# Convert to pandas
+print(f"Converting metadata into pandas")
+arxiv_metadata_all = dataset['train'].to_pandas()

+########################################
+# Function to extract year from arxiv id
+# https://info.arxiv.org/help/arxiv_identifier.html
+# Function to extract Month and year of publication using arxiv ID
+def extract_month_year(arxiv_id, what='month'):
+    # Identify the relevant YYMM part based on the arXiv ID format
+    yymm = arxiv_id.split('/')[-1][:4] if '/' in arxiv_id else arxiv_id.split('.')[0]
+
+    # Convert the year-month string to a datetime object
+    date = datetime.strptime(yymm, '%y%m')
+
+    # Return the desired part based on the input parameter
+    return date.strftime('%B') if what == 'month' else date.strftime('%Y')
 ########################################

 # Add year to metadata
 print(f"Adding year to metadata")
-arxiv_metadata_all =
+arxiv_metadata_all['year'] = arxiv_metadata_all['id'].progress_apply(extract_month_year, what='year')

 # Filter by year
 print(f"Filtering metadata by year: {year}")
-# Convert to pandas
-print(f"Loading metadata for year: {year} into pandas")
-arxiv_metadata_split = arxiv_metadata_all['train'].to_pandas()
+arxiv_metadata_split = arxiv_metadata_all[arxiv_metadata_all['year'] == year]

 ################################################################################
 # Load Model
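A quick sanity check of extract_month_year against both arXiv identifier schemes (illustrative IDs, assuming the function above is in scope):

print(extract_month_year('2106.01345', what='year'))      # '2021'   (post-2007 IDs: YYMM.NNNNN)
print(extract_month_year('2106.01345', what='month'))     # 'June'
print(extract_month_year('hep-th/9901001', what='year'))  # '1999'   (pre-2007 IDs: archive/YYMMNNN)
print(extract_month_year('hep-th/9901001', what='month')) # 'January'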
@@ -131,8 +146,13 @@ if LOCAL:
 else:
     print("Setting up mxbai API client")
     print("To use local resources, set LOCAL = True")
+
     # Setup mxbai
+    if is_running_in_huggingface_space():
+        mxbai_api_key = os.getenv("MXBAI_API_KEY")
+    else:
+        mxbai_api_key = config["MXBAI_API_KEY"]
+
     mxbai = MixedbreadAI(api_key=mxbai_api_key)

 ########################################
@@ -142,10 +162,13 @@ def embed(input_text):
     if LOCAL:

         # Calculate embeddings by calling model.encode(), specifying the device
-        embedding = model.encode(input_text, device=device)
+        embedding = model.encode(input_text, device=device, precision="float32")

+        # Enforce 32-bit float precision
+        embedding = np.array(embedding, dtype=np.float32)

+    else:
+
         # Avoid rate limit from api
         sleep(0.2)

@@ -158,7 +181,8 @@ def embed(input_text):
             truncation_strategy='end'
         )

+        # Enforce 32-bit float precision
+        embedding = np.array(result.data[0].embedding, dtype=np.float32)

     return embedding
 ########################################
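The two new casts do the same job in both branches: the repository name suggests the parquet file feeds a Milvus FLOAT_VECTOR field, which stores 32-bit floats, while np.array over a plain Python list of floats (the shape the API result arrives in) defaults to float64. A minimal sketch of the difference, assuming nothing beyond NumPy:

import numpy as np

api_like = [0.1, 0.2, 0.3]                          # embeddings as plain Python floats
print(np.array(api_like).dtype)                     # float64 by default
print(np.array(api_like, dtype=np.float32).dtype)   # float32, matching precision="float32" locally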
@@ -203,6 +227,9 @@ except Exception as e:
 # Find papers that are not in the previous embeddings
 new_papers = arxiv_metadata_split[~arxiv_metadata_split['id'].isin(previous_embeddings['id'])]

+# Drop duplicates based on the 'id' column
+new_papers = new_papers.drop_duplicates(subset='id', keep='last', ignore_index=True)
+
 # Number of new papers
 num_new_papers = len(new_papers)

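A toy illustration of what keep='last' does when the metadata dump carries the same arXiv id more than once (hypothetical rows):

import pandas as pd

df = pd.DataFrame({'id': ['2106.01345', '2106.01345', '2203.00001'],
                   'abstract': ['old record', 'latest record', 'unique record']})
print(df.drop_duplicates(subset='id', keep='last', ignore_index=True))
#            id       abstract
# 0  2106.01345  latest record
# 1  2203.00001  unique record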
@@ -216,17 +243,39 @@ if num_new_papers == 0:
 print(f"Creating new embeddings for: {num_new_papers} entries")
 new_papers["vector"] = new_papers["abstract"].progress_apply(embed)

-new_papers.rename(columns={'title': 'Title', 'authors': 'Authors', 'abstract': 'Abstract'}, inplace=True)
+####################
 # Add URL column
-new_papers['$meta'] = new_papers[['Title', 'Authors', 'Abstract', 'URL']].apply(lambda row: json.dumps(row.to_dict()), axis=1)
+arxiv_metadata_split['url'] = 'https://arxiv.org/abs/' + arxiv_metadata_split['id']
+
+# Add month column
+arxiv_metadata_split['month'] = arxiv_metadata_split['id'].progress_apply(extract_month_year, what='month')
+
+####################
+# Remove newline characters from authors, title, abstract and categories columns
+arxiv_metadata_split['title'] = arxiv_metadata_split['title'].astype(str).str.replace('\n', ' ', regex=False)
+
+arxiv_metadata_split['authors'] = arxiv_metadata_split['authors'].astype(str).str.replace('\n', ' ', regex=False)
+
+arxiv_metadata_split['categories'] = arxiv_metadata_split['categories'].astype(str).str.replace('\n', ' ', regex=False)
+
+arxiv_metadata_split['abstract'] = arxiv_metadata_split['abstract'].astype(str).str.replace('\n', ' ', regex=False)
+
+####################
+# Trim title to 512 characters
+arxiv_metadata_split['title'] = arxiv_metadata_split['title'].progress_apply(lambda x: x[:508] + '...' if len(x) > 512 else x)
+
+# Trim categories to 128 characters
+arxiv_metadata_split['categories'] = arxiv_metadata_split['categories'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x)
+
+# Trim authors to 128 characters
+arxiv_metadata_split['authors'] = arxiv_metadata_split['authors'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x)
+
+# Trim abstract to 3072 characters
+arxiv_metadata_split['abstract'] = arxiv_metadata_split['abstract'].progress_apply(lambda x: x[:3068] + '...' if len(x) > 3072 else x)
+
+####################
 # Selecting id, vector and $meta to retain
-selected_columns = ['id', 'vector', '$meta']
+selected_columns = ['id', 'vector', 'title', 'abstract', 'authors', 'categories', 'month', 'year', 'url']

 # Merge previous embeddings and new embeddings
 new_embeddings = pd.concat([previous_embeddings, new_papers[selected_columns]])
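The four trim lines all follow one pattern; written as a hypothetical helper (not part of the diff), with the 512/128/3072 caps presumably matching the VARCHAR max_length limits of the target Milvus collection schema, which is not shown here:

def trim(text, limit):
    # Keep at most `limit` characters, reserving three for the ellipsis marker
    return text[:limit - 4] + '...' if len(text) > limit else text

print(len(trim('a' * 600, 512)))   # 511: 508 kept characters plus '...'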
@@ -248,7 +297,13 @@ new_embeddings.to_parquet(embed_filename, index=False)
 if UPLOAD:

     print(f"Uploading new embeddings to: {repo_id}")
+
+    # Setup Hugging Face API
+    if is_running_in_huggingface_space():
+        access_token = os.getenv("HF_API_KEY")
+    else:
+        access_token = config["HF_API_KEY"]
+
     api = HfApi(token=access_token)

     # Upload all files within the folder to the specified repository
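The upload call itself sits below this hunk, outside the changed lines; for context, a typical huggingface_hub pattern for pushing a folder looks like the sketch below. folder_path and repo_type here are assumptions for illustration, not values taken from the diff:

api.upload_folder(
    folder_path="embeddings",   # assumed local folder holding the parquet file
    repo_id=repo_id,
    repo_type="dataset",        # assumed; the target repo hosts data rather than a model
)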