Spaces:
Running
on
Zero
Running
on
Zero
add a better similarity score calculation
Browse files
app.py
CHANGED
|
@@ -300,7 +300,12 @@ df = pd.read_excel('perfume_database_cleaned.xlsx')
|
|
| 300 |
def extract_notes_for_comparison(data: Union[str, dict]) -> list[str]:
|
| 301 |
"""
|
| 302 |
Extracts all notes from the Olfactory Pyramid section of a JSON string or dict.
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
"""
|
| 305 |
if isinstance(data, str):
|
| 306 |
try:
|
|
@@ -311,86 +316,51 @@ def extract_notes_for_comparison(data: Union[str, dict]) -> list[str]:
|
|
| 311 |
if not isinstance(data, dict):
|
| 312 |
raise TypeError("Input must be a dict or a valid JSON string")
|
| 313 |
|
| 314 |
-
olfactory_pyramid = data.get("Olfactory Pyramid")
|
| 315 |
if not olfactory_pyramid:
|
| 316 |
-
|
| 317 |
|
| 318 |
notes = []
|
| 319 |
for layer in ["Top Notes", "Heart Notes", "Base Notes"]:
|
| 320 |
-
layer_data = olfactory_pyramid.get(layer)
|
| 321 |
if not layer_data:
|
| 322 |
continue
|
| 323 |
for item in layer_data:
|
| 324 |
-
note = item.get("note")
|
| 325 |
if note:
|
| 326 |
notes.append(note.strip())
|
| 327 |
|
|
|
|
|
|
|
|
|
|
| 328 |
return notes
|
| 329 |
|
| 330 |
-
|
| 331 |
from rapidfuzz import fuzz
|
| 332 |
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
user_notes = extract_notes_for_comparison(data)
|
| 339 |
-
if not user_notes:
|
| 340 |
-
return pd.DataFrame([{
|
| 341 |
-
'brand': 'N/A',
|
| 342 |
-
'perfume': 'N/A',
|
| 343 |
-
'matching_notes': 'No notes found in input',
|
| 344 |
-
'match_count': 0,
|
| 345 |
-
'similarity_score': 0
|
| 346 |
-
}])
|
| 347 |
-
|
| 348 |
-
# Lowercase user notes
|
| 349 |
-
user_notes_clean = [n.strip().lower() for n in user_notes]
|
| 350 |
-
|
| 351 |
-
matches = []
|
| 352 |
-
for _, row in df.iterrows():
|
| 353 |
-
perfume_notes = [n.strip().lower() for n in row['notes'].split(',')]
|
| 354 |
-
matched_notes = []
|
| 355 |
-
total_ratio = 0
|
| 356 |
-
|
| 357 |
-
for u_note in user_notes_clean:
|
| 358 |
-
best_p_note = None
|
| 359 |
-
best_ratio = 0
|
| 360 |
-
for p_note in perfume_notes:
|
| 361 |
-
ratio = fuzz.partial_ratio(u_note, p_note)
|
| 362 |
-
if ratio > best_ratio:
|
| 363 |
-
best_ratio = ratio
|
| 364 |
-
best_p_note = p_note
|
| 365 |
-
if best_ratio >= threshold:
|
| 366 |
-
matched_notes.append(best_p_note)
|
| 367 |
-
total_ratio += best_ratio
|
| 368 |
-
|
| 369 |
-
matches.append({
|
| 370 |
-
'brand': row['brand'],
|
| 371 |
-
'perfume': row['perfume'],
|
| 372 |
-
'matching_notes': ', '.join(sorted(set(matched_notes))),
|
| 373 |
-
'match_count': len(set(matched_notes)),
|
| 374 |
-
'similarity_score': total_ratio
|
| 375 |
})
|
| 376 |
|
| 377 |
result = pd.DataFrame(matches)
|
| 378 |
result = result[result['match_count'] > 0]
|
|
|
|
| 379 |
if result.empty:
|
| 380 |
return pd.DataFrame([{
|
| 381 |
'brand': 'N/A',
|
| 382 |
'perfume': 'N/A',
|
| 383 |
-
'matching_notes': 'No matching
|
| 384 |
'match_count': 0,
|
| 385 |
-
'similarity_score': 0
|
|
|
|
|
|
|
| 386 |
}])
|
| 387 |
|
| 388 |
-
result = result.sort_values(by=['
|
| 389 |
-
result = result.head(top_n).reset_index(drop=True)
|
| 390 |
|
| 391 |
return result
|
| 392 |
-
|
| 393 |
-
|
| 394 |
|
| 395 |
def infer(image_input):
|
| 396 |
|
|
|
|
| 300 |
def extract_notes_for_comparison(data: Union[str, dict]) -> list[str]:
|
| 301 |
"""
|
| 302 |
Extracts all notes from the Olfactory Pyramid section of a JSON string or dict.
|
| 303 |
+
|
| 304 |
+
Args:
|
| 305 |
+
data (Union[str, dict]): The JSON string or Python dict.
|
| 306 |
+
|
| 307 |
+
Returns:
|
| 308 |
+
list[str]: A list of extracted note names.
|
| 309 |
"""
|
| 310 |
if isinstance(data, str):
|
| 311 |
try:
|
|
|
|
| 316 |
if not isinstance(data, dict):
|
| 317 |
raise TypeError("Input must be a dict or a valid JSON string")
|
| 318 |
|
| 319 |
+
olfactory_pyramid = data.get("Olfactory Pyramid")
|
| 320 |
if not olfactory_pyramid:
|
| 321 |
+
raise KeyError("No 'Olfactory Pyramid' found in the data")
|
| 322 |
|
| 323 |
notes = []
|
| 324 |
for layer in ["Top Notes", "Heart Notes", "Base Notes"]:
|
| 325 |
+
layer_data = olfactory_pyramid.get(layer)
|
| 326 |
if not layer_data:
|
| 327 |
continue
|
| 328 |
for item in layer_data:
|
| 329 |
+
note = item.get("note")
|
| 330 |
if note:
|
| 331 |
notes.append(note.strip())
|
| 332 |
|
| 333 |
+
if not notes:
|
| 334 |
+
raise ValueError("No notes found in the Olfactory Pyramid")
|
| 335 |
+
|
| 336 |
return notes
|
| 337 |
|
|
|
|
| 338 |
from rapidfuzz import fuzz
|
| 339 |
|
| 340 |
+
'matching_notes': ', '.join(unique_matched_notes),
|
| 341 |
+
'match_count': match_count,
|
| 342 |
+
'similarity_score': round(total_ratio, 2),
|
| 343 |
+
'purity': round(purity, 2),
|
| 344 |
+
'adjusted_score': round(adjusted_score, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
})
|
| 346 |
|
| 347 |
result = pd.DataFrame(matches)
|
| 348 |
result = result[result['match_count'] > 0]
|
| 349 |
+
|
| 350 |
if result.empty:
|
| 351 |
return pd.DataFrame([{
|
| 352 |
'brand': 'N/A',
|
| 353 |
'perfume': 'N/A',
|
| 354 |
+
'matching_notes': 'No matching notes found',
|
| 355 |
'match_count': 0,
|
| 356 |
+
'similarity_score': 0,
|
| 357 |
+
'purity': 0,
|
| 358 |
+
'adjusted_score': 0
|
| 359 |
}])
|
| 360 |
|
| 361 |
+
result = result.sort_values(by=['adjusted_score'], ascending=False).head(top_n).reset_index(drop=True)
|
|
|
|
| 362 |
|
| 363 |
return result
|
|
|
|
|
|
|
| 364 |
|
| 365 |
def infer(image_input):
|
| 366 |
|