Spaces:
Running
on
Zero
Running
on
Zero
update perfume comparison
Browse files
app.py
CHANGED
@@ -301,7 +301,7 @@ df = pd.read_excel('perfume_database_cleaned.xlsx')
|
|
301 |
|
302 |
def extract_notes_for_comparison(data: Union[str, dict]) -> list[str]:
|
303 |
"""
|
304 |
-
Extracts
|
305 |
"""
|
306 |
if isinstance(data, str):
|
307 |
try:
|
@@ -310,98 +310,83 @@ def extract_notes_for_comparison(data: Union[str, dict]) -> list[str]:
|
|
310 |
raise ValueError("Invalid JSON string provided")
|
311 |
|
312 |
if not isinstance(data, dict):
|
313 |
-
raise TypeError("Input must be a dict or
|
314 |
|
315 |
-
olfactory_pyramid = data.get("Olfactory Pyramid")
|
316 |
if not olfactory_pyramid:
|
317 |
-
|
318 |
|
319 |
notes = []
|
320 |
for layer in ["Top Notes", "Heart Notes", "Base Notes"]:
|
321 |
-
layer_data = olfactory_pyramid.get(layer)
|
322 |
if not layer_data:
|
323 |
continue
|
324 |
for item in layer_data:
|
325 |
-
note = item.get("note")
|
326 |
if note:
|
327 |
notes.append(note.strip())
|
328 |
|
329 |
-
if not notes:
|
330 |
-
raise ValueError("No notes found in the Olfactory Pyramid")
|
331 |
-
|
332 |
return notes
|
333 |
|
334 |
-
def find_best_perfumes_from_json(
|
335 |
-
data: Union[str, dict],
|
336 |
-
top_n: int = 5,
|
337 |
-
threshold: int = 80
|
338 |
-
) -> pd.DataFrame:
|
339 |
"""
|
340 |
-
|
|
|
341 |
"""
|
342 |
-
|
343 |
-
|
344 |
-
except Exception as e:
|
345 |
-
# Return fallback if extraction fails
|
346 |
-
return pd.DataFrame([{
|
347 |
-
'brand': 'N/A',
|
348 |
-
'perfume': 'N/A',
|
349 |
-
'matching_notes': f'Error: {str(e)}',
|
350 |
-
'match_count': 0,
|
351 |
-
'purity': 0,
|
352 |
-
'adjusted_score': 0
|
353 |
-
}])
|
354 |
-
|
355 |
-
user_notes_clean = [n.strip().lower() for n in user_notes if n.strip()]
|
356 |
|
357 |
matches = []
|
|
|
358 |
for _, row in df.iterrows():
|
359 |
-
perfume_notes = [
|
360 |
-
|
361 |
-
for n in row['notes'].split(',')
|
362 |
-
if n.strip()
|
363 |
-
]
|
364 |
|
365 |
-
matched = []
|
366 |
for u_note in user_notes_clean:
|
367 |
for p_note in perfume_notes:
|
368 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
369 |
if ratio >= threshold:
|
370 |
-
|
371 |
|
372 |
-
unique_matched_notes = sorted(set(
|
373 |
-
|
|
|
|
|
|
|
374 |
|
375 |
total_notes = len(perfume_notes)
|
376 |
-
|
377 |
-
purity = match_count / total_notes if total_notes else 0
|
378 |
adjusted_score = match_count * purity
|
379 |
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
})
|
389 |
|
390 |
if not matches:
|
|
|
391 |
return pd.DataFrame([{
|
392 |
'brand': 'N/A',
|
393 |
-
'perfume': '
|
394 |
-
'matching_notes': '
|
395 |
'match_count': 0,
|
396 |
'purity': 0,
|
397 |
'adjusted_score': 0
|
398 |
}])
|
399 |
|
400 |
result = pd.DataFrame(matches)
|
401 |
-
result = result.sort_values(
|
402 |
-
by=['adjusted_score', 'match_count'],
|
403 |
-
ascending=[False, False]
|
404 |
-
).head(top_n).reset_index(drop=True)
|
405 |
|
406 |
return result
|
407 |
|
|
|
301 |
|
302 |
def extract_notes_for_comparison(data: Union[str, dict]) -> list[str]:
|
303 |
"""
|
304 |
+
Extracts notes from Olfactory Pyramid in a JSON string or dict.
|
305 |
"""
|
306 |
if isinstance(data, str):
|
307 |
try:
|
|
|
310 |
raise ValueError("Invalid JSON string provided")
|
311 |
|
312 |
if not isinstance(data, dict):
|
313 |
+
raise TypeError("Input must be a dict or valid JSON string")
|
314 |
|
315 |
+
olfactory_pyramid = data.get("Olfactory Pyramid") or data.get("olfactory pyramid")
|
316 |
if not olfactory_pyramid:
|
317 |
+
return [] # No pyramid found, fail gracefully
|
318 |
|
319 |
notes = []
|
320 |
for layer in ["Top Notes", "Heart Notes", "Base Notes"]:
|
321 |
+
layer_data = olfactory_pyramid.get(layer) or olfactory_pyramid.get(layer.lower())
|
322 |
if not layer_data:
|
323 |
continue
|
324 |
for item in layer_data:
|
325 |
+
note = item.get("note") or item.get("Note")
|
326 |
if note:
|
327 |
notes.append(note.strip())
|
328 |
|
|
|
|
|
|
|
329 |
return notes
|
330 |
|
331 |
+
def _note_similarity(user_note: str, perfume_note: str) -> float:
    """Return a 0-100 similarity score between two lowercased, stripped notes."""
    if len(user_note) < 4:
        # Very short user notes fuzzy-match almost anything: require equality.
        return 100 if user_note == perfume_note else 0

    score = fuzz.token_set_ratio(user_note, perfume_note)
    if len(perfume_note) >= 4:
        # partial_ratio aligns the shorter string against substrings of the
        # longer one, so a short database note such as "oud" would score 100
        # against "cloud". Only trust it when the database note is not short;
        # token_set_ratio still matches whole words ("oud wood" vs "oud").
        score = max(score, fuzz.partial_ratio(user_note, perfume_note))
    return score


def _no_match_frame() -> pd.DataFrame:
    """Build the single-row placeholder returned when nothing matches."""
    return pd.DataFrame([{
        'brand': 'N/A',
        'perfume': 'No match found',
        'matching_notes': '',
        'match_count': 0,
        'purity': 0,
        'adjusted_score': 0
    }])


def find_best_perfumes_from_json(
    data: Union[str, dict],
    top_n: int = 5,
    threshold: int = 80,
) -> pd.DataFrame:
    """
    Fuzzy-match user notes against database notes.
    Uses token_set_ratio + partial_ratio + short-word safeguard.

    Args:
        data: JSON string or dict handed to ``extract_notes_for_comparison``
            to obtain the user's notes.
        top_n: Maximum number of perfumes to return.
        threshold: Minimum similarity score (0-100) for a database note to
            count as matched.

    Returns:
        A DataFrame with columns ``brand``, ``perfume``, ``matching_notes``,
        ``match_count``, ``purity`` (matched notes / notes of the perfume) and
        ``adjusted_score`` (``match_count * purity``), best matches first.
        When nothing matches, a single placeholder row with
        ``perfume == 'No match found'`` is returned instead.

    Raises:
        Any exception raised by ``extract_notes_for_comparison`` propagates
        unchanged.
    """
    user_notes = extract_notes_for_comparison(data)
    # Normalise, drop blanks (a whitespace-only note would otherwise "match"
    # an empty database fragment) and de-duplicate while keeping order.
    user_notes_clean = list(dict.fromkeys(
        n.strip().lower() for n in user_notes if n.strip()
    ))
    if not user_notes_clean:
        # Nothing to compare against: skip the database scan entirely.
        return _no_match_frame()

    matches = []
    for brand, perfume, raw_notes in zip(df['brand'], df['perfume'], df['notes']):
        if not isinstance(raw_notes, str):
            # A blank Excel cell is loaded by pandas as NaN (a float), which
            # has no .split(); such rows carry no notes to match.
            continue

        # Ignore empty fragments from stray/trailing commas so they neither
        # count towards the perfume's total nor show up as a "matched" note.
        perfume_notes = [
            n.strip().lower() for n in raw_notes.split(',') if n.strip()
        ]

        unique_matched_notes = sorted({
            p_note
            for p_note in perfume_notes
            if any(
                _note_similarity(u_note, p_note) >= threshold
                for u_note in user_notes_clean
            )
        })
        match_count = len(unique_matched_notes)
        if match_count == 0:
            continue  # Skip if no match at all

        # perfume_notes is non-empty here because at least one note matched.
        purity = match_count / len(perfume_notes)
        adjusted_score = match_count * purity

        matches.append({
            'brand': brand,
            'perfume': perfume,
            'matching_notes': ', '.join(unique_matched_notes),
            'match_count': match_count,
            'purity': round(purity, 2),
            'adjusted_score': round(adjusted_score, 2)
        })

    if not matches:
        # Nothing matched at all
        return _no_match_frame()

    result = pd.DataFrame(matches)
    # match_count breaks ties between equal adjusted scores so the ranking is
    # deterministic (e.g. 1 of 1 notes vs 2 of 4 notes both score 1.0).
    result = result.sort_values(
        by=['adjusted_score', 'match_count'],
        ascending=[False, False]
    ).head(top_n).reset_index(drop=True)

    return result
|
392 |
|