JerLag committed on
Commit
9557430
·
verified ·
1 Parent(s): 4857f00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -46
app.py CHANGED
@@ -1,20 +1,31 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- NPS Assurance — Gradio (Paste-only)
4
- - Entrée: verbatims collés (1 ligne = 1 verbatim, score NPS optionnel après un séparateur, ex: "|")
5
- - Sorties: émotion (pos/neutre/neg), thématiques, occurrences, résumé Markdown, graphiques Plotly
6
- - IA (facultatif): OpenAI pour sentiment/thèmes/synthèse. Sans clé, fallback HuggingFace (si installé) puis règles lexicales.
7
- - Déployable tel quel sur Hugging Face Spaces (app_file = app.py)
 
8
  """
9
 
10
  import os, re, json, collections, tempfile, zipfile
11
  from typing import List, Dict, Optional
12
  import pandas as pd
13
- from unidecode import unidecode
14
  import gradio as gr
15
  import plotly.express as px
16
  import plotly.graph_objects as go
17
 
 
 
 
 
 
 
 
 
 
 
 
18
  # ---------------- Thésaurus ASSURANCE ----------------
19
  THEMES = {
20
  "Remboursements santé":[r"\bremboursement[s]?\b", r"\bt[eé]l[eé]transmission\b", r"\bno[eé]mie\b",
@@ -60,8 +71,7 @@ INTENSIFIERS = [r"\btr[eè]s\b", r"\bvraiment\b", r"\bextr[eê]mement\b", r"\bhy
60
  DIMINISHERS = [r"\bun[e]?\s+peu\b", r"\bassez\b", r"\bplut[oô]t\b", r"\bl[eé]g[eè]rement\b"]
61
  INTENSIFIER_W, DIMINISHER_W = 1.5, 0.7
62
 
63
- # --------------- OpenAI (optionnel) ---------------
64
- # --- OpenAI (optionnel, robuste) ---
65
  OPENAI_AVAILABLE = False
66
  try:
67
  from openai import OpenAI
@@ -71,7 +81,6 @@ try:
71
  except Exception:
72
  OPENAI_AVAILABLE = False
73
 
74
-
75
  # ---------------- Utils ----------------
76
  def normalize(t:str)->str:
77
  if not isinstance(t,str): return ""
@@ -177,6 +186,32 @@ def oa_summary(nps:Optional[float], dist:Dict[str,int], themes_df:pd.DataFrame,
177
  if isinstance(j, dict): return ' '.join(str(v) for v in j.values())
178
  return None
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  # --------- Graphiques ----------
181
  def fig_nps_gauge(nps: Optional[float]) -> go.Figure:
182
  v = 0.0 if nps is None else float(nps)
@@ -202,7 +237,7 @@ def fig_theme_balance(themes_df: pd.DataFrame, k: int) -> go.Figure:
202
  fig = px.bar(d2, x="theme", y="count", color="type", barmode="stack", title=f"Top {k} thèmes — balance Pos/Neg")
203
  fig.update_layout(xaxis_tickangle=-30); return fig
204
 
205
- # --------- Analyse principale ----------
206
  def analyze_text(pasted_txt, has_sc, sep_chr,
207
  do_anonymize, use_oa_sent, use_oa_themes, use_oa_summary,
208
  oa_model, oa_temp, top_k):
@@ -214,22 +249,14 @@ def analyze_text(pasted_txt, has_sc, sep_chr,
214
  if do_anonymize:
215
  df["comment"]=df["comment"].apply(anonymize)
216
 
217
- # après (on retombe sur HF/règles automatiquement)
218
  if (use_oa_sent or use_oa_themes or use_oa_summary) and not OPENAI_AVAILABLE:
219
  use_oa_sent = use_oa_themes = use_oa_summary = False
220
 
221
- # HF sentiment (optionnel)
222
- HF_AVAILABLE=False
223
- try:
224
- from transformers import pipeline
225
- hf_pipe = pipeline("text-classification",
226
- model="cmarkea/distilcamembert-base-sentiment",
227
- tokenizer="cmarkea/distilcamembert-base-sentiment")
228
- HF_AVAILABLE=True
229
- except Exception:
230
- HF_AVAILABLE=False
231
  def hf_sent(text:str):
232
- if not HF_AVAILABLE or not text.strip(): return None
233
  try:
234
  res=hf_pipe(text); lab=str(res[0]["label"]).lower(); p=float(res[0].get("score",0.5))
235
  if "1" in lab or "2" in lab: return {"label":"negatif","score":-4*p}
@@ -241,14 +268,15 @@ def analyze_text(pasted_txt, has_sc, sep_chr,
241
  rows=[]
242
  theme_agg=collections.defaultdict(lambda:{"mentions":0,"pos":0,"neg":0})
243
  used_hf=False; used_oa=False
 
244
 
245
- for _, r in df.iterrows():
246
- cid=r["id"]; comment=normalize(str(r["comment"]))
247
 
248
  # Sentiment: OpenAI -> HF -> règles
249
  sent=None
250
  if use_oa_sent:
251
- sent=oa_sentiment(comment, oa_model, oa_temp); used_oa = used_oa or bool(sent)
252
  if not sent:
253
  hf=hf_sent(comment)
254
  if hf: sent=hf; used_hf=True
@@ -259,7 +287,7 @@ def analyze_text(pasted_txt, has_sc, sep_chr,
259
  # Thèmes: regex (+ fusion OpenAI)
260
  themes, counts = detect_themes_regex(comment)
261
  if use_oa_themes:
262
- tjson=oa_themes(comment, oa_model, oa_temp)
263
  if isinstance(tjson, dict):
264
  used_oa=True
265
  for th, c in (tjson.get("counts",{}) or {}).items():
@@ -267,18 +295,37 @@ def analyze_text(pasted_txt, has_sc, sep_chr,
267
  counts[th] = max(counts.get(th, 0), int(c))
268
  themes = [th for th, c in counts.items() if c > 0]
269
 
270
- bucket = nps_bucket(r.get("nps_score", None))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  for th, c in counts.items():
273
  theme_agg[th]["mentions"] += c
274
- if sent["label"] == "positive":
275
- theme_agg[th]["pos"] += 1
276
- elif sent["label"] == "negatif":
277
- theme_agg[th]["neg"] += 1
278
 
279
  rows.append({
280
- "id": cid, "nps_score": r.get("nps_score", None), "nps_bucket": bucket,
281
  "comment": comment,
 
 
 
 
 
282
  "sentiment_score": round(float(sent["score"]), 3),
283
  "sentiment_label": sent["label"],
284
  "sentiment_source": "openai" if (use_oa_sent and used_oa) else ("huggingface" if used_hf else "rules"),
@@ -287,7 +334,7 @@ def analyze_text(pasted_txt, has_sc, sep_chr,
287
  })
288
 
289
  out_df=pd.DataFrame(rows)
290
- nps=compute_nps(df["nps_score"]) # peut être None si pas de scores
291
  dist=out_df["sentiment_label"].value_counts().to_dict()
292
 
293
  # Stats par thème
@@ -298,11 +345,12 @@ def analyze_text(pasted_txt, has_sc, sep_chr,
298
  "net_sentiment":int(d["pos"]-d["neg"])})
299
  themes_df=pd.DataFrame(trs).sort_values(["total_mentions","net_sentiment"],ascending=[False,False])
300
 
301
- # Synthèse texte
302
  method = "OpenAI + HF + règles" if (use_oa_sent and used_hf) else ("OpenAI + règles" if use_oa_sent else ("HF + règles" if used_hf else "Règles"))
 
303
  lines=[ "# Synthèse NPS & ressentis clients",
304
  f"- **Méthode** : {method}",
305
- f"- **NPS global** : {nps:.1f}" if nps is not None else "- **NPS global** : n/a" ]
306
  if dist:
307
  tot=sum(dist.values()); pos=dist.get("positive",0); neg=dist.get("negatif",0); neu=dist.get("neutre",0)
308
  lines.append(f"- **Répartition émotions** : positive {pos}/{tot}, neutre {neu}/{tot}, négative {neg}/{tot}")
@@ -313,10 +361,10 @@ def analyze_text(pasted_txt, has_sc, sep_chr,
313
  summary_md="\n".join(lines)
314
 
315
  if use_oa_summary:
316
- md = oa_summary(nps, dist, themes_df, oa_model, oa_temp)
317
  if md: summary_md = md + "\n\n---\n" + summary_md
318
 
319
- # Fichiers export
320
  tmpdir=tempfile.mkdtemp(prefix="nps_gradio_")
321
  enriched=os.path.join(tmpdir,"enriched_comments.csv")
322
  themes=os.path.join(tmpdir,"themes_stats.csv")
@@ -330,14 +378,7 @@ def analyze_text(pasted_txt, has_sc, sep_chr,
330
  z.write(themes,arcname="themes_stats.csv")
331
  z.write(summ,arcname="summary.md")
332
 
333
- # Graphiques
334
- fig_gauge = fig_nps_gauge(nps)
335
- fig_emots = fig_sentiment_bar(dist)
336
- k = max(1, int(top_k or 10))
337
- fig_top = fig_top_themes(themes_df, k)
338
- fig_bal = fig_theme_balance(themes_df, k)
339
-
340
- # Panneaux (rapide)
341
  def make_panels(dfT: pd.DataFrame):
342
  if dfT is None or dfT.empty: return "—","—","—"
343
  pos_top = dfT.sort_values(["verbatims_pos","total_mentions"], ascending=[False,False]).head(4)
@@ -362,13 +403,18 @@ def analyze_text(pasted_txt, has_sc, sep_chr,
362
  return ench_md, irr_md, "\n".join(rec_lines)
363
 
364
  ench_md, irr_md, reco_md = make_panels(themes_df)
 
 
 
 
 
365
 
366
  return (summary_md, themes_df.head(100), out_df.head(200), [enriched, themes, summ, zip_path],
367
  ench_md, irr_md, reco_md, fig_gauge, fig_emots, fig_top, fig_bal)
368
 
369
  # ---------------- UI ----------------
370
  with gr.Blocks(title="NPS — Analyse (Assurance)") as demo:
371
- gr.Markdown("## 🔎 NPS — Analyse sémantique (Assurance)\nColle tes verbatims (1 par ligne). Option: score NPS après un `|`.")
372
 
373
  with gr.Column():
374
  pasted = gr.Textbox(label="Verbatims (un par ligne)", lines=10,
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ NPS Assurance — Gradio (Paste-only, NPS inféré)
4
+ - Entrée : verbatims collés (1 ligne = 1 verbatim, score NPS optionnel après un séparateur, ex: "|")
5
+ - Sorties : émotion (pos/neutre/neg), thématiques, occurrences, résumé Markdown, graphiques Plotly
6
+ - IA (facultatif) : OpenAI pour sentiment/thèmes/synthèse (fallback vers HuggingFace si dispo, puis règles lexicales)
7
+ - NPS inféré : si une note manque, elle est déduite du sentiment (cohérente avec NPS)
8
+ - Tolérant : si OpenAI ou unidecode ne sont pas installés, l’app continue avec des fallback
9
  """
10
 
11
  import os, re, json, collections, tempfile, zipfile
12
  from typing import List, Dict, Optional
13
  import pandas as pd
 
14
  import gradio as gr
15
  import plotly.express as px
16
  import plotly.graph_objects as go
17
 
18
+ # --- unidecode (fallback si paquet absent) ---
19
+ try:
20
+ from unidecode import unidecode
21
+ except Exception:
22
+ import unicodedata
23
+ def unidecode(x):
24
+ try:
25
+ return unicodedata.normalize('NFKD', str(x)).encode('ascii','ignore').decode('ascii')
26
+ except Exception:
27
+ return str(x)
28
+
29
  # ---------------- Thésaurus ASSURANCE ----------------
30
  THEMES = {
31
  "Remboursements santé":[r"\bremboursement[s]?\b", r"\bt[eé]l[eé]transmission\b", r"\bno[eé]mie\b",
 
71
  DIMINISHERS = [r"\bun[e]?\s+peu\b", r"\bassez\b", r"\bplut[oô]t\b", r"\bl[eé]g[eè]rement\b"]
72
  INTENSIFIER_W, DIMINISHER_W = 1.5, 0.7
73
 
74
+ # --------------- OpenAI (optionnel, robuste) ---------------
 
75
  OPENAI_AVAILABLE = False
76
  try:
77
  from openai import OpenAI
 
81
  except Exception:
82
  OPENAI_AVAILABLE = False
83
 
 
84
  # ---------------- Utils ----------------
85
  def normalize(t:str)->str:
86
  if not isinstance(t,str): return ""
 
186
  if isinstance(j, dict): return ' '.join(str(v) for v in j.values())
187
  return None
188
 
189
+ # --------- HuggingFace sentiment (optionnel)
190
+ def make_hf_pipe():
191
+ try:
192
+ from transformers import pipeline
193
+ return pipeline("text-classification",
194
+ model="cmarkea/distilcamembert-base-sentiment",
195
+ tokenizer="cmarkea/distilcamembert-base-sentiment")
196
+ except Exception:
197
+ return None
198
+
199
+ # --------- Inférence de note NPS à partir du sentiment ----------
200
+ def infer_nps_from_sentiment(label: str, score: float) -> int:
201
+ """
202
+ label: 'positive' | 'neutre' | 'negatif'
203
+ score: [-4..+4]
204
+ Retourne 0..10 aligné avec NPS (detractor ≤6, passive 7–8, promoter ≥9).
205
+ """
206
+ # Mise à l’échelle douce: [-4..+4] -> [0..10]
207
+ scaled = int(round((float(score) + 4.0) * 1.25)) # -4 -> 0, 0 -> 5, +4 -> 10
208
+ scaled = max(0, min(10, scaled))
209
+ if label == "positive":
210
+ return max(9, scaled) # garantir promoter
211
+ if label == "negatif":
212
+ return min(6, scaled) # garantir detractor
213
+ return 8 if score >= 0 else 7 # neutre
214
+
215
  # --------- Graphiques ----------
216
  def fig_nps_gauge(nps: Optional[float]) -> go.Figure:
217
  v = 0.0 if nps is None else float(nps)
 
237
  fig = px.bar(d2, x="theme", y="count", color="type", barmode="stack", title=f"Top {k} thèmes — balance Pos/Neg")
238
  fig.update_layout(xaxis_tickangle=-30); return fig
239
 
240
+ # --------- Analyse principale (paste-only) ----------
241
  def analyze_text(pasted_txt, has_sc, sep_chr,
242
  do_anonymize, use_oa_sent, use_oa_themes, use_oa_summary,
243
  oa_model, oa_temp, top_k):
 
249
  if do_anonymize:
250
  df["comment"]=df["comment"].apply(anonymize)
251
 
252
+ # Si OpenAI indisponible, on bascule silencieusement
253
  if (use_oa_sent or use_oa_themes or use_oa_summary) and not OPENAI_AVAILABLE:
254
  use_oa_sent = use_oa_themes = use_oa_summary = False
255
 
256
+ hf_pipe = make_hf_pipe()
257
+
 
 
 
 
 
 
 
 
258
  def hf_sent(text:str):
259
+ if hf_pipe is None or not text.strip(): return None
260
  try:
261
  res=hf_pipe(text); lab=str(res[0]["label"]).lower(); p=float(res[0].get("score",0.5))
262
  if "1" in lab or "2" in lab: return {"label":"negatif","score":-4*p}
 
268
  rows=[]
269
  theme_agg=collections.defaultdict(lambda:{"mentions":0,"pos":0,"neg":0})
270
  used_hf=False; used_oa=False
271
+ any_inferred=False
272
 
273
+ for idx, r in df.iterrows():
274
+ cid=r.get("id", idx+1); comment=normalize(str(r["comment"]))
275
 
276
  # Sentiment: OpenAI -> HF -> règles
277
  sent=None
278
  if use_oa_sent:
279
+ sent=oa_sentiment(comment, oa_model, float(oa_temp or 0.0)); used_oa = used_oa or bool(sent)
280
  if not sent:
281
  hf=hf_sent(comment)
282
  if hf: sent=hf; used_hf=True
 
287
  # Thèmes: regex (+ fusion OpenAI)
288
  themes, counts = detect_themes_regex(comment)
289
  if use_oa_themes:
290
+ tjson=oa_themes(comment, oa_model, float(oa_temp or 0.0))
291
  if isinstance(tjson, dict):
292
  used_oa=True
293
  for th, c in (tjson.get("counts",{}) or {}).items():
 
295
  counts[th] = max(counts.get(th, 0), int(c))
296
  themes = [th for th, c in counts.items() if c > 0]
297
 
298
+ # Note NPS : donnée ou inférée
299
+ given = r.get("nps_score", None)
300
+ try:
301
+ given = int(given) if given is not None and str(given).strip() != "" else None
302
+ except Exception:
303
+ given = None
304
+
305
+ if given is None:
306
+ inferred = infer_nps_from_sentiment(sent["label"], float(sent["score"]))
307
+ nps_final = inferred
308
+ nps_source = "inferred"
309
+ any_inferred = True
310
+ else:
311
+ nps_final = given
312
+ nps_source = "given"
313
+
314
+ bucket = nps_bucket(nps_final)
315
 
316
  for th, c in counts.items():
317
  theme_agg[th]["mentions"] += c
318
+ if sent["label"] == "positive": theme_agg[th]["pos"] += 1
319
+ elif sent["label"] == "negatif": theme_agg[th]["neg"] += 1
 
 
320
 
321
  rows.append({
322
+ "id": cid,
323
  "comment": comment,
324
+ "nps_score_given": given,
325
+ "nps_score_inferred": nps_final if given is None else None,
326
+ "nps_score_final": nps_final,
327
+ "nps_source": nps_source,
328
+ "nps_bucket": bucket,
329
  "sentiment_score": round(float(sent["score"]), 3),
330
  "sentiment_label": sent["label"],
331
  "sentiment_source": "openai" if (use_oa_sent and used_oa) else ("huggingface" if used_hf else "rules"),
 
334
  })
335
 
336
  out_df=pd.DataFrame(rows)
337
+ nps=compute_nps(out_df["nps_score_final"])
338
  dist=out_df["sentiment_label"].value_counts().to_dict()
339
 
340
  # Stats par thème
 
345
  "net_sentiment":int(d["pos"]-d["neg"])})
346
  themes_df=pd.DataFrame(trs).sort_values(["total_mentions","net_sentiment"],ascending=[False,False])
347
 
348
+ # Synthèse
349
  method = "OpenAI + HF + règles" if (use_oa_sent and used_hf) else ("OpenAI + règles" if use_oa_sent else ("HF + règles" if used_hf else "Règles"))
350
+ nps_label = "NPS global (inféré)" if any_inferred else "NPS global"
351
  lines=[ "# Synthèse NPS & ressentis clients",
352
  f"- **Méthode** : {method}",
353
+ f"- **{nps_label}** : {nps:.1f}" if nps is not None else f"- **{nps_label}** : n/a" ]
354
  if dist:
355
  tot=sum(dist.values()); pos=dist.get("positive",0); neg=dist.get("negatif",0); neu=dist.get("neutre",0)
356
  lines.append(f"- **Répartition émotions** : positive {pos}/{tot}, neutre {neu}/{tot}, négative {neg}/{tot}")
 
361
  summary_md="\n".join(lines)
362
 
363
  if use_oa_summary:
364
+ md = oa_summary(nps, dist, themes_df, oa_model, float(oa_temp or 0.0))
365
  if md: summary_md = md + "\n\n---\n" + summary_md
366
 
367
+ # Exports
368
  tmpdir=tempfile.mkdtemp(prefix="nps_gradio_")
369
  enriched=os.path.join(tmpdir,"enriched_comments.csv")
370
  themes=os.path.join(tmpdir,"themes_stats.csv")
 
378
  z.write(themes,arcname="themes_stats.csv")
379
  z.write(summ,arcname="summary.md")
380
 
381
+ # Panneaux & Graphes
 
 
 
 
 
 
 
382
  def make_panels(dfT: pd.DataFrame):
383
  if dfT is None or dfT.empty: return "—","—","—"
384
  pos_top = dfT.sort_values(["verbatims_pos","total_mentions"], ascending=[False,False]).head(4)
 
403
  return ench_md, irr_md, "\n".join(rec_lines)
404
 
405
  ench_md, irr_md, reco_md = make_panels(themes_df)
406
+ fig_gauge = fig_nps_gauge(nps)
407
+ fig_emots = fig_sentiment_bar(dist)
408
+ k = max(1, int(top_k or 10))
409
+ fig_top = fig_top_themes(themes_df, k)
410
+ fig_bal = fig_theme_balance(themes_df, k)
411
 
412
  return (summary_md, themes_df.head(100), out_df.head(200), [enriched, themes, summ, zip_path],
413
  ench_md, irr_md, reco_md, fig_gauge, fig_emots, fig_top, fig_bal)
414
 
415
  # ---------------- UI ----------------
416
  with gr.Blocks(title="NPS — Analyse (Assurance)") as demo:
417
+ gr.Markdown("## 🔎 NPS — Analyse sémantique (Assurance)\nCollez vos verbatims (1 par ligne). Option : note NPS après `|`.")
418
 
419
  with gr.Column():
420
  pasted = gr.Textbox(label="Verbatims (un par ligne)", lines=10,