|
"""Main Gradio application for viewing dataset with text comparison.""" |
|
|
|
import gradio as gr |
|
import random |
|
from data_loader import get_dataset_size, get_sample |
|
from diff_utils import generate_html_diff, get_diff_stats |
|
|
|
|
|
def load_sample_data(sample_index: int | float | None = None) -> tuple[str, str, str, str]:
    """Load one dataset sample and prepare everything the viewer displays.

    Args:
        sample_index: Position of the sample to load. ``None`` picks a random
            sample. Out-of-range values are clamped to the nearest valid
            index. Floats are accepted (a UI number field may deliver the
            value as a float) and are truncated to an integer.

    Returns:
        A ``(original_text, cleaned_text, diff_html, stats_text)`` tuple.
        ``diff_html`` is whatever ``generate_html_diff`` returns for the two
        texts, and ``stats_text`` is a Markdown summary built from
        ``get_diff_stats``.

    Raises:
        ValueError: If ``get_dataset_size()`` reports no samples.
    """
    # Query the size once; it is needed for both random choice and clamping.
    dataset_size = get_dataset_size()
    if dataset_size <= 0:
        raise ValueError("Dataset is empty; there is no sample to load.")
    last_index = dataset_size - 1

    if sample_index is None:
        sample_index = random.randint(0, last_index)
    else:
        # Normalise to a real int so indexing and the "Sample #N" header do
        # not end up with a value such as 5.0.
        sample_index = int(sample_index)

    # Clamp into [0, last_index] rather than failing on out-of-range input.
    sample_index = max(0, min(sample_index, last_index))

    original_text, cleaned_text = get_sample(sample_index)
    diff_html = generate_html_diff(original_text, cleaned_text)
    stats = get_diff_stats(original_text, cleaned_text)

    # Built line by line so that no source-code indentation leaks into the
    # Markdown that is rendered in the statistics panel.
    stats_text = (
        "\n"
        f"**Sample #{sample_index}**\n"
        "\n"
        f"- Original length: {stats['original_length']:,} characters\n"
        f"- Cleaned length: {stats['cleaned_length']:,} characters\n"
        f"- Characters removed: {stats['characters_removed']:,}\n"
        f"- Similarity: {stats['similarity_ratio']}%\n"
    )

    return original_text, cleaned_text, diff_html, stats_text
|
|
|
|
|
def create_interface() -> gr.Blocks:
    """Build the Gradio Blocks UI for comparing original and cleaned text.

    The layout consists of a sample-index input with "Load Sample" and
    "Random Sample" buttons, a Markdown statistics panel, two side-by-side
    text boxes (original and cleaned), and an HTML diff view. Sample 0 is
    loaded automatically when the page opens.

    Returns:
        The assembled ``gr.Blocks`` application; the caller is responsible
        for launching it.
    """
    # Styling hooks referenced through ``elem_classes`` further down.
    custom_css = """
    .textbox-container {
        max-height: 400px;
        overflow-y: auto;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        padding: 12px;
        background: #fafafa;
    }
    .stats-box {
        background: #f8f9fa;
        border: 1px solid #dee2e6;
        border-radius: 8px;
        padding: 16px;
        margin: 8px 0;
    }
    """

    with gr.Blocks(
        title="Dataset Text Comparison Viewer",
        css=custom_css,
    ) as interface:
        gr.Markdown("# Dataset Text Comparison Viewer")
        gr.Markdown("Compare original and cleaned text from **sumuks/essential-web-v1.0-sample-1M-with-cleaned-text**")

        # Sample selection controls.
        with gr.Row():
            with gr.Column():
                sample_input = gr.Number(
                    label="Sample Index",
                    value=0,
                    minimum=0,
                    maximum=get_dataset_size() - 1,
                    step=1,
                    # precision=0 makes the component hand an int (not a
                    # float) to the callback, which uses it as an index.
                    precision=0,
                )

                with gr.Row():
                    load_btn = gr.Button("Load Sample", variant="primary")
                    random_btn = gr.Button("Random Sample", variant="secondary")

        # Summary statistics for the currently displayed sample.
        stats_output = gr.Markdown(label="Statistics", elem_classes=["stats-box"])

        # Side-by-side view of the two text versions.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Text")
                original_output = gr.Textbox(
                    label="Original",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )

            with gr.Column():
                gr.Markdown("### Cleaned Text")
                cleaned_output = gr.Textbox(
                    label="Cleaned",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )

        gr.Markdown("### Diff Visualization")
        diff_output = gr.HTML(label="Diff")

        # Every event fills the same four components, in the order of the
        # tuple returned by load_sample_data.
        result_components = [original_output, cleaned_output, diff_output, stats_output]

        # Load the sample whose index is typed into the number field.
        load_btn.click(
            fn=load_sample_data,
            inputs=[sample_input],
            outputs=result_components,
        )

        # Passing None asks load_sample_data to pick a random sample.
        random_btn.click(
            fn=lambda: load_sample_data(None),
            inputs=[],
            outputs=result_components,
        )

        # Show the first sample as soon as the page is opened.
        interface.load(
            fn=lambda: load_sample_data(0),
            inputs=[],
            outputs=result_components,
        )

    return interface
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI and serve it from this process.
    # share=False requests no public share link; debug=True turns on
    # Gradio's debug mode for the launch.
    launch_options = {"share": False, "debug": True}
    app = create_interface()
    app.launch(**launch_options)