import gradio as gr
from lettucedetect.models.inference import HallucinationDetector
title = """# 🙋🏻‍♂️Welcome to 🌟Tonic's 🥬 LettuceDetect - 🤯🧠 Hallucination Tester 🟢🔴
Powered by `lettucedect-large-modernbert-en-v1` from KRLabsOrg. Detect hallucinations in answers based on context and questions using ModernBERT with 8192-token context support!
"""
description2 = """
### Model Details
- **Model Name**: [lettucedect-large-modernbert-en-v1](https://huggingface.co/KRLabsOrg/lettucedect-large-modernbert-en-v1)
- **Organization**: [KRLabsOrg](https://huggingface.co/KRLabsOrg)
- **GitHub**: [https://github.com/KRLabsOrg/LettuceDetect](https://github.com/KRLabsOrg/LettuceDetect)
- **Architecture**: ModernBERT (Large) with extended context support up to 8192 tokens
- **Task**: Token Classification / Hallucination Detection
- **Training Dataset**: [RAGTruth](https://huggingface.co/datasets/wandb/RAGTruth-processed)
- **Language**: English
- **Capabilities**: Detects hallucinated spans in answers, provides confidence scores, and calculates average confidence across detected spans.
LettuceDetect excels at processing long documents to determine if an answer aligns with the provided context, making it a powerful tool for ensuring factual accuracy.
"""
howto1 = """
### How to Use LettuceDetect Tester
1. **Enter a Context**: Provide the source text or document (e.g., "France is a country in Europe..."). This is the factual basis for evaluation.
2. **Enter a Question**: Ask something related to the context (e.g., "What is the capital of France?").
3. **Enter an Answer**: Input the response you want to check (e.g., "The capital of France is Paris. The population is 69 million.").
4. **Press Submit**: Analyze the answer for hallucinations!
"""
howto2 = """
### Understanding the Output
- **Status**:
- 🟢 = No hallucinations detected
- 🔴 = Hallucinations detected
- ⚪ = Error occurred
- **Explanation**: A brief summary of the result.
- **Highlighted Answer**: Shows the answer with hallucinated parts in **red**, labeled with confidence scores (e.g., "hallucination (conf: 0.9944)").
- **Hallucinated Spans & Confidence**: Lists each hallucinated segment with its confidence score.
- **Average Confidence**: Displays the average confidence of all detected hallucinations (e.g., "Average Confidence: 0.9944").
Use this tool to ensure your answers are grounded in reality!
"""
join_us = """
## Join us:
🌟TeamTonic🌟 is always making cool demos! Join our active builders' 🛠️ community 👻
[Join us on Discord](https://discord.gg/n8ytYeh25n)
On 🤗Hugging Face: [MultiTransformer](https://huggingface.co/MultiTransformer)
On 🌐GitHub: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟[Data Tonic](https://github.com/multiTonic/thinking-dataset/)
🤗 Big thanks to Yuvi Sharma and all the folks at Hugging Face for the community grant 🤗
"""
# Initialize the LettuceDetect model
detector = HallucinationDetector(
    method="transformer",
    model_path="KRLabsOrg/lettucedect-large-modernbert-en-v1",
)
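# Constructing the detector loads the checkpoint (fetched from the Hugging Face Hub
# on the Space's first startup), so the initial launch can take a while.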
# Function to evaluate hallucination with LettuceDetect
def evaluate_hallucination(context, question, answer):
    try:
        # Get span-level predictions from LettuceDetect
        predictions = detector.predict(
            context=[context],
            question=question,
            answer=answer,
            output_format="spans"
        )
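        # Shape of `predictions`, inferred from how the spans are consumed below
        # (check the LettuceDetect docs for the authoritative schema): a list of
        # span dicts such as
        #   [{"start": 32, "end": 71,
        #     "text": "The population of France is 69 million.",
        #     "confidence": 0.9944}]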
        # Process predictions for HighlightedText
        if not predictions:
            return "🟢", "No hallucinations detected", [(answer, None)], "N/A", "N/A"

        highlighted_segments = []
        confidence_scores = []
        last_end = 0
        total_confidence = 0.0
        for pred in predictions:
            start, end = pred['start'], pred['end']
            confidence = pred['confidence']
            text = pred['text']
            # Add non-hallucinated text before this span
            if last_end < start:
                highlighted_segments.append((answer[last_end:start], None))
            # Add the hallucinated span with its confidence as the label
            label_with_confidence = f"hallucination (conf: {confidence:.4f})"
            highlighted_segments.append((text, label_with_confidence))
            confidence_scores.append(f"'{text}' - Confidence: {confidence:.4f}")
            total_confidence += confidence
            last_end = end

        # Add any remaining text after the last hallucination
        if last_end < len(answer):
            highlighted_segments.append((answer[last_end:], None))

        # Average confidence across detected spans (predictions is non-empty here)
        avg_confidence = total_confidence / len(predictions)

        return (
            "🔴",
            "Hallucinations detected",
            highlighted_segments,
            "\n".join(confidence_scores),
            f"Average Confidence: {avg_confidence:.4f}"
        )
    except Exception as e:
        return "⚪", f"Error: {str(e)}", [(answer, None)], "N/A", "N/A"
# Gradio Blocks interface
with gr.Blocks(
    title="🥬 LettuceDetect Hallucination Tester 🟢🔴"
) as demo:
    gr.Markdown(title)
    with gr.Row():
        with gr.Group():
            gr.Markdown(description2)
        with gr.Group():
            gr.Markdown(howto2)
    with gr.Row():
        with gr.Group():
            gr.Markdown(howto1)
        with gr.Group():
            gr.Markdown(join_us)
    with gr.Row():
        with gr.Column(scale=2):
            # Inputs
            context_input = gr.Textbox(
                label="Context",
                lines=5,
                placeholder="Enter the context (e.g., a document or source text)..."
            )
            question_input = gr.Textbox(
                label="Question",
                placeholder="Enter the question..."
            )
            answer_input = gr.Textbox(
                label="Answer",
                lines=3,
                placeholder="Enter the answer to evaluate..."
            )
            submit_btn = gr.Button("Submit")
        with gr.Column(scale=3):
            with gr.Row():
                with gr.Column():
                    status_output = gr.Label(label="Status")
                with gr.Column():
                    explanation_output = gr.Textbox(label="Explanation", interactive=False)
            highlighted_answer_output = gr.HighlightedText(
                label="Answer with Hallucinations Highlighted",
                show_legend=False,
                # Note: span labels embed the confidence (e.g. "hallucination (conf: 0.9944)"),
                # so they don't match this key exactly and Gradio falls back to default colors.
                color_map={"hallucination": "red"},
                combine_adjacent=True
            )
            spans_output = gr.Textbox(label="Hallucinated Spans & Confidence", lines=5, interactive=False)
            avg_confidence_output = gr.Textbox(label="Average Confidence", interactive=False)

    # Connect inputs to outputs via the evaluation function
    submit_btn.click(
        fn=evaluate_hallucination,
        inputs=[context_input, question_input, answer_input],
        outputs=[status_output, explanation_output, highlighted_answer_output, spans_output, avg_confidence_output]
    )
    # Example
    gr.Markdown("### Example")
    with gr.Row():
        gr.Examples(
            examples=[
                [
                    "France is a country in Europe. The capital of France is Paris. The population of France is 67 million.",
                    "What is the capital of France? What is the population of France?",
                    "The capital of France is Paris. The population of France is 69 million."
                ]
            ],
            inputs=[context_input, question_input, answer_input]
        )
# Launch the demo
demo.launch()
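# When running locally rather than on Spaces, demo.launch(share=True) would also serve
# a temporary public URL (share is a standard Gradio launch() flag).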