Spaces:
Running
on
Zero
Running
on
Zero
typo
Browse files
app.py
CHANGED
@@ -337,7 +337,64 @@ def extract_notes_for_comparison(data: Union[str, dict]) -> list[str]:
|
|
337 |
|
338 |
from rapidfuzz import fuzz
|
339 |
|
340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
'match_count': match_count,
|
342 |
'similarity_score': round(total_ratio, 2),
|
343 |
'purity': round(purity, 2),
|
|
|
337 |
|
338 |
from rapidfuzz import fuzz
|
339 |
|
340 |
+
def find_best_perfumes_from_json(data: Union[str, dict], top_n: int = 5, threshold: int = 80) -> pd.DataFrame:
|
341 |
+
"""
|
342 |
+
Finds top N matching perfumes using fuzzy matching on notes, with purity factor.
|
343 |
+
|
344 |
+
Args:
|
345 |
+
data (Union[str, dict]): The input JSON or dict.
|
346 |
+
top_n (int): Number of results to return.
|
347 |
+
threshold (int): Minimum fuzz.partial_ratio score a user note must reach (>=) against a perfume note to count as a match.
|
348 |
+
|
349 |
+
Returns:
|
350 |
+
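pd.DataFrame: Matching perfumes. If the notes cannot be extracted (KeyError/ValueError), a single 'N/A' row with the error message in 'matching_notes' and zeroed scores.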
|
351 |
+
"""
|
352 |
+
try:
|
353 |
+
user_notes = extract_notes_for_comparison(data)
|
354 |
+
except (KeyError, ValueError) as e:
|
355 |
+
return pd.DataFrame([{
|
356 |
+
'brand': 'N/A',
|
357 |
+
'perfume': 'N/A',
|
358 |
+
'matching_notes': str(e),
|
359 |
+
'match_count': 0,
|
360 |
+
'similarity_score': 0,
|
361 |
+
'purity': 0,
|
362 |
+
'adjusted_score': 0
|
363 |
+
}])
|
364 |
+
|
365 |
+
user_notes_clean = [n.strip().lower() for n in user_notes]
|
366 |
+
|
367 |
+
matches = []
|
368 |
+
|
369 |
+
for _, row in df.iterrows():
|
370 |
+
perfume_notes = [n.strip().lower() for n in row['notes'].split(',')]
|
371 |
+
total_perfume_notes = len(perfume_notes)
|
372 |
+
|
373 |
+
matched_notes = []
|
374 |
+
total_ratio = 0
|
375 |
+
|
376 |
+
for u_note in user_notes_clean:
|
377 |
+
best_ratio = 0
|
378 |
+
best_p_note = None
|
379 |
+
for p_note in perfume_notes:
|
380 |
+
ratio = fuzz.partial_ratio(u_note, p_note)
|
381 |
+
if ratio > best_ratio:
|
382 |
+
best_ratio = ratio
|
383 |
+
best_p_note = p_note
|
384 |
+
if best_ratio >= threshold and best_p_note:
|
385 |
+
matched_notes.append(best_p_note)
|
386 |
+
total_ratio += best_ratio
|
387 |
+
|
388 |
+
unique_matched_notes = sorted(set(matched_notes))
|
389 |
+
match_count = len(unique_matched_notes)
|
390 |
+
|
391 |
+
purity = match_count / total_perfume_notes if total_perfume_notes > 0 else 0
|
392 |
+
adjusted_score = purity * total_ratio
|
393 |
+
|
394 |
+
matches.append({
|
395 |
+
'brand': row['brand'],
|
396 |
+
'perfume': row['perfume'],
|
397 |
+
'matching_notes': ', '.join(unique_matched_notes),
|
398 |
'match_count': match_count,
|
399 |
'similarity_score': round(total_ratio, 2),
|
400 |
'purity': round(purity, 2),
|