Update update_embeddings.py
Browse files- update_embeddings.py +10 -1
update_embeddings.py
CHANGED
@@ -174,8 +174,17 @@ previous_embeddings = pd.read_parquet(previous_embed)
|
|
174 |
# Find papers that are not in the previous embeddings
|
175 |
new_papers = arxiv_metadata_split[~arxiv_metadata_split['id'].isin(previous_embeddings['id'])]
|
176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
# Create a column for embeddings
|
178 |
-
print(f"Creating new embeddings for: {len(new_papers)} entries")
|
179 |
new_papers["vector"] = new_papers["abstract"].progress_apply(embed)
|
180 |
|
181 |
# Rename columns
|
|
|
174 |
# Find papers that are not in the previous embeddings
|
175 |
new_papers = arxiv_metadata_split[~arxiv_metadata_split['id'].isin(previous_embeddings['id'])]
|
176 |
|
177 |
+
# Number of new papers
|
178 |
+
num_new_papers = len(new_papers)
|
179 |
+
|
180 |
+
# What if there are no new papers?
|
181 |
+
if num_new_papers == 0:
|
182 |
+
    print(f"No new papers found for year: {year}")
|
183 |
+
    print("Exiting")
|
184 |
+
    sys.exit()
|
185 |
+
|
186 |
# Create a column for embeddings
|
187 |
+
print(f"Creating new embeddings for: {num_new_papers} entries")
|
188 |
new_papers["vector"] = new_papers["abstract"].progress_apply(embed)
|
189 |
|
190 |
# Rename columns
|