Spaces:

maximuspowers
/

bias-detection-ner

Running

App Files Community

maximuspowers committed on Dec 15, 2024

Commit

e0dca16

verified ·

1 Parent(s): 34ab835

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -46

app.py CHANGED Viewed

@@ -20,8 +20,6 @@ id2label = {
     6: 'I-UNFAIR'
 }
-label2id = {v: k for k, v in id2label.items()}
 # Entity colors for highlights
 label_colors = {
     "STEREO": "rgba(255, 0, 0, 0.2)",  # Light Red
@@ -34,38 +32,23 @@ def post_process_entities(result):
     prev_entity_type = None
     for token_data in result:
         labels = token_data["labels"]
-        labels = list(set(labels))
-        # Handle conflicting B- and I- tags for the same entity
-        for entity_type in ["GEN", "UNFAIR", "STEREO"]:
-            if f"B-{entity_type}" in labels and f"I-{entity_type}" in labels:
-                labels.remove(f"I-{entity_type}")
         # Handle sequence rules
-        current_entity_type = None
-        current_label = None
-        for label in labels:
-            if label.startswith("B-") or label.startswith("I-"):
-                current_label = label
-                current_entity_type = label[2:]
-        if current_entity_type:
-            if current_label.startswith("B-") and prev_entity_type == current_entity_type:
-                labels.remove(current_label)
-                labels.append(f"I-{current_entity_type}")
-            if current_label.startswith("I-") and prev_entity_type != current_entity_type:
-                labels.remove(current_label)
-                labels.append(f"B-{current_entity_type}")
-            prev_entity_type = current_entity_type
-        else:
-            prev_entity_type = None
-        token_data["labels"] = labels
     return result
-# Generate JSON results
-def generate_json(sentence):
     inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
     input_ids = inputs['input_ids'].to(model.device)
     attention_mask = inputs['attention_mask'].to(model.device)
@@ -74,26 +57,24 @@ def generate_json(sentence):
         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
         logits = outputs.logits
         probabilities = torch.sigmoid(logits)
-        predicted_labels = (probabilities > 0.5).int()
     tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
     result = []
     for i, token in enumerate(tokens):
         if token not in tokenizer.all_special_tokens:
-            label_indices = (predicted_labels[0][i] == 1).nonzero(as_tuple=False).squeeze(-1)
-            labels = [id2label[idx.item()] for idx in label_indices] if label_indices.numel() > 0 else ['O']
             result.append({"token": token.replace("##", ""), "labels": labels})
     result = post_process_entities(result)
-    return json.dumps(result, indent=4)
-# Predict function
-def predict_ner_tags_with_json(sentence):
-    json_result = generate_json(sentence)
-    result = json.loads(json_result)
     word_row = []
     stereo_row = []
     gen_row = []
@@ -105,19 +86,28 @@ def predict_ner_tags_with_json(sentence):
         word_row.append(f"<span style='font-weight:bold;'>{token}</span>")
-        stereo_labels = [label[2:] for label in labels if "STEREO" in label]
         stereo_row.append(
             f"<span style='background:{label_colors['STEREO']}; border-radius:6px; padding:2px 5px;'>{', '.join(stereo_labels)}</span>"
             if stereo_labels else "&nbsp;"
         )
-        gen_labels = [label[2:] for label in labels if "GEN" in label]
         gen_row.append(
             f"<span style='background:{label_colors['GEN']}; border-radius:6px; padding:2px 5px;'>{', '.join(gen_labels)}</span>"
             if gen_labels else "&nbsp;"
         )
-        unfair_labels = [label[2:] for label in labels if "UNFAIR" in label]
         unfair_row.append(
             f"<span style='background:{label_colors['UNFAIR']}; border-radius:6px; padding:2px 5px;'>{', '.join(unfair_labels)}</span>"
             if unfair_labels else "&nbsp;"
@@ -144,7 +134,7 @@ def predict_ner_tags_with_json(sentence):
     </table>
     """
-    return f"{matrix_html}<br><pre>{json_result}</pre>"
 # Gradio Interface
 iface = gr.Blocks()
@@ -168,7 +158,7 @@ with iface:
     with gr.Row():
         input_box = gr.Textbox(label="Input Sentence")
     with gr.Row():
-        output_box = gr.HTML(label="Entity Matrix and JSON Output")
     input_box.change(predict_ner_tags_with_json, inputs=[input_box], outputs=[output_box])

     6: 'I-UNFAIR'
 }
 # Entity colors for highlights
 label_colors = {
     "STEREO": "rgba(255, 0, 0, 0.2)",  # Light Red
     prev_entity_type = None
     for token_data in result:
         labels = token_data["labels"]
         # Handle sequence rules
+        new_labels = []
+        for label_data in labels:
+            label = label_data['label']
+            if label.startswith("B-") and prev_entity_type == label[2:]:
+                new_labels.append({"label": f"I-{label[2:]}", "confidence": label_data["confidence"]})
+            elif label.startswith("I-") and prev_entity_type != label[2:]:
+                new_labels.append({"label": f"B-{label[2:]}", "confidence": label_data["confidence"]})
+            else:
+                new_labels.append(label_data)
+            prev_entity_type = label[2:]
+        token_data["labels"] = new_labels
     return result
+# Generate JSON results with probabilities
+def predict_ner_tags_with_json(sentence):
     inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
     input_ids = inputs['input_ids'].to(model.device)
     attention_mask = inputs['attention_mask'].to(model.device)
         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
         logits = outputs.logits
         probabilities = torch.sigmoid(logits)
     tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
     result = []
     for i, token in enumerate(tokens):
         if token not in tokenizer.all_special_tokens:
+            label_indices = (probabilities[0][i] > 0.52).nonzero(as_tuple=False).squeeze(-1)
+            labels = [
+                {
+                    "label": id2label[idx.item()],
+                    "confidence": round(probabilities[0][i][idx].item() * 100, 2)
+                }
+                for idx in label_indices
+            ]
             result.append({"token": token.replace("##", ""), "labels": labels})
     result = post_process_entities(result)
+    # Create table rows
     word_row = []
     stereo_row = []
     gen_row = []
         word_row.append(f"<span style='font-weight:bold;'>{token}</span>")
+        # STEREO
+        stereo_labels = [
+            f"{label_data['label'][2:]} ({label_data['confidence']}%)" for label_data in labels if "STEREO" in label_data["label"]
+        ]
         stereo_row.append(
             f"<span style='background:{label_colors['STEREO']}; border-radius:6px; padding:2px 5px;'>{', '.join(stereo_labels)}</span>"
             if stereo_labels else "&nbsp;"
         )
+        # GEN
+        gen_labels = [
+            f"{label_data['label'][2:]} ({label_data['confidence']}%)" for label_data in labels if "GEN" in label_data["label"]
+        ]
         gen_row.append(
             f"<span style='background:{label_colors['GEN']}; border-radius:6px; padding:2px 5px;'>{', '.join(gen_labels)}</span>"
             if gen_labels else "&nbsp;"
         )
+        # UNFAIR
+        unfair_labels = [
+            f"{label_data['label'][2:]} ({label_data['confidence']}%)" for label_data in labels if "UNFAIR" in label_data["label"]
+        ]
         unfair_row.append(
             f"<span style='background:{label_colors['UNFAIR']}; border-radius:6px; padding:2px 5px;'>{', '.join(unfair_labels)}</span>"
             if unfair_labels else "&nbsp;"
     </table>
     """
+    return matrix_html
 # Gradio Interface
 iface = gr.Blocks()
     with gr.Row():
         input_box = gr.Textbox(label="Input Sentence")
     with gr.Row():
+        output_box = gr.HTML(label="Entity Matrix")
     input_box.change(predict_ner_tags_with_json, inputs=[input_box], outputs=[output_box])