Spaces:

HuggingFaceFW-Dev
/

vibe-check-translations

Sleeping

App Files Files Community

guipenedo HF Staff committed on Jul 30

Commit

c502324

1 Parent(s): 5a49ee7

added app

Browse files

Files changed (3) hide show

README.md +53 -1
app.py +292 -0
requirements.txt +4 -0

README.md CHANGED Viewed

@@ -10,4 +10,56 @@ pinned: false
 short_description: A/B test translations
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 short_description: A/B test translations
 ---
+# Translation A/B Testing App
+A Gradio app for comparing translation quality between different model configurations through A/B testing.
+## Features
+- **Language Selection**: Choose from available languages in the S3 bucket
+- **Side-by-Side Comparison**: Compare translations from "few-shots" vs "no-few-shots" configurations
+- **Randomized Presentation**: The order of configurations is randomized to avoid bias
+- **Progress Tracking**: Shows current progress through the dataset
+- **Results Summary**: Displays final vote counts and percentages
+## Setup
+1. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+2. Configure AWS credentials (for S3 access):
+```bash
+export AWS_ACCESS_KEY_ID=your_key
+export AWS_SECRET_ACCESS_KEY=your_secret
+# or use AWS CLI: aws configure
+```
+3. Run the app:
+```bash
+python app.py
+```
+The app will be available at `http://localhost:7860`
+## Usage
+1. **Select Language**: Choose a language from the dropdown menu
+2. **Load Data**: Click "Load Data" to fetch translation pairs from S3
+3. **Compare Translations**:
+   - Original text is shown at the top
+   - Two translations (A and B) are shown side by side
+   - Click "Choose Left" or "Choose Right" to select the better translation
+4. **View Results**: After all comparisons, see the final vote counts
+## Data Source
+The app loads translation data from `s3://fineweb-multilingual-v1/experiments/translations/vibe-checks/` with the following structure:
+- `{language}_Latn/few-shots.jsonl` - Translations with few-shot examples
+- `{language}_Latn/no-few-shots.jsonl` - Translations without few-shot examples
+Each JSONL file contains documents with:
+- `text`: Original text to translate
+- `id`: Unique document identifier
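+- `inference_results`: Array with translation results; the app displays the `text` of the first entry, using the part between `<START_TRANSLATION>` and `<END_TRANSLATION>` when those tags are present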

app.py ADDED Viewed

	@@ -0,0 +1,292 @@

+#!/usr/bin/env python3
+"""
+Gradio app for A/B testing different translation configs
+"""
+import gradio as gr
+import boto3
+import json
+import random
+import re
+from pathlib import Path
+from datatrove.pipeline.readers.jsonl import JsonlReader
+from collections import defaultdict
+from typing import Dict, List, Tuple, Optional
+# Initialize S3 client
+# Created once at import time; no explicit credentials are passed here.
+s3_client = boto3.client('s3')
+# Bucket and key prefix under which each language has its own folder.
+BUCKET_NAME = "fineweb-multilingual-v1"
+BASE_PREFIX = "experiments/translations/vibe-checks/"
+# Global state for the app
+# NOTE(review): this is one module-level dict, so it looks like every browser
+# session served by this process shares the same progress and tallies - confirm
+# that is acceptable for the deployment.
+app_state = {
+    'current_samples': [],  # list of (left_doc, right_doc, metadata) tuples
+    'current_index': 0,  # position of the pair currently on screen
+    'results': {'config_a': 0, 'config_b': 0},  # config_a = few-shots, config_b = no-few-shots
+    'config_names': [],  # never read or written by the functions in this file
+    'language': '',  # language folder the current samples were loaded from
+    'total_samples': 0  # len(current_samples) at load time, used for progress text
+}
+def list_languages_from_s3() -> List[str]:
+    """List the language folders found directly under BASE_PREFIX in the bucket.
+    Returns:
+        The folder names sorted alphabetically, or an empty list if the
+        listing fails (the error is printed, not raised).
+    """
+    try:
+        # A single list_objects_v2 call is capped at 1000 entries, so walk
+        # every page to avoid silently dropping folders.
+        paginator = s3_client.get_paginator('list_objects_v2')
+        pages = paginator.paginate(
+            Bucket=BUCKET_NAME,
+            Prefix=BASE_PREFIX,
+            Delimiter='/'
+        )
+        languages = []
+        for page in pages:
+            for prefix in page.get('CommonPrefixes', []):
+                folder = prefix['Prefix']
+                # Strip only the leading BASE_PREFIX; str.replace would also
+                # remove any later occurrence inside the folder name.
+                if folder.startswith(BASE_PREFIX):
+                    folder = folder[len(BASE_PREFIX):]
+                languages.append(folder.rstrip('/'))
+        return sorted(languages)
+    except Exception as e:
+        # Best-effort: the UI falls back to an empty dropdown.
+        print(f"Error listing languages: {e}")
+        return []
+def extract_translation(inference_result: dict) -> str:
+    """Return the translation held in an inference result, without its tags.
+    Args:
+        inference_result: Dict whose 'text' entry holds the raw model output.
+    Returns:
+        The stripped text between <START_TRANSLATION> and <END_TRANSLATION>.
+        If the closing tag is missing, everything after the opening tag is
+        returned. If no opening tag is present, the stripped text is returned
+        unchanged. "No translation available" is returned when there is no
+        usable 'text' entry.
+    """
+    if not inference_result or 'text' not in inference_result:
+        return "No translation available"
+    text = inference_result['text']
+    if not isinstance(text, str):
+        # A null or non-string output can be neither searched nor stripped.
+        return "No translation available"
+    # The closing tag is optional (\Z = end of string) so that output cut off
+    # before <END_TRANSLATION> still has its opening tag removed.
+    pattern = r'<START_TRANSLATION>(.*?)(?:<END_TRANSLATION>|\Z)'
+    match = re.search(pattern, text, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+    # Fallback: return the text as is if no opening tag is found
+    return text.strip()
+def load_config_data(language: str, config_name: str) -> List[dict]:
+    """Read every document of one config's JSONL file for a language from S3.
+    Any failure is printed and turned into an empty list.
+    """
+    try:
+        location = f"s3://{BUCKET_NAME}/{BASE_PREFIX}{language}/{config_name}.jsonl"
+        print(f"Loading data from: {location}")
+        # Calling the datatrove reader yields the documents one by one;
+        # collect them all before returning.
+        return list(JsonlReader(location)())
+    except Exception as e:
+        print(f"Error loading {config_name} data for {language}: {e}")
+        return []
+def prepare_ab_test_data(language: str) -> List[Tuple[dict, dict, dict]]:
+    """Build shuffled (left_doc, right_doc, metadata) pairs for a language.
+    Only documents whose id appears in both configs are paired, and the side
+    each config lands on is chosen at random for every pair.
+    """
+    first_name, second_name = "few-shots", "no-few-shots"
+    # Fetch both configs
+    first_docs = load_config_data(language, first_name)
+    second_docs = load_config_data(language, second_name)
+    print(f"Loaded {len(first_docs)} samples for {first_name}")
+    print(f"Loaded {len(second_docs)} samples for {second_name}")
+    # Index each config by document id (a later duplicate id overwrites an earlier one)
+    first_by_id = {}
+    for doc in first_docs:
+        first_by_id[doc.id] = doc
+    second_by_id = {}
+    for doc in second_docs:
+        second_by_id[doc.id] = doc
+    # Keep only the ids present in both configs
+    shared_ids = set(first_by_id.keys()) & set(second_by_id.keys())
+    print(f"Found {len(shared_ids)} common document IDs")
+    pairs = []
+    for doc_id in shared_ids:
+        first_doc = first_by_id[doc_id]
+        second_doc = second_by_id[doc_id]
+        # Coin flip: does the few-shots translation go on the left?
+        if random.random() < 0.5:
+            sides = (first_doc, second_doc, first_name, second_name)
+        else:
+            sides = (second_doc, first_doc, second_name, first_name)
+        left_doc, right_doc, left_config, right_config = sides
+        details = {
+            'left_config': left_config,
+            'right_config': right_config,
+            # Taken from the few-shots document; treated as identical in both
+            'original_text': first_doc.text
+        }
+        pairs.append((left_doc, right_doc, details))
+    # Randomize presentation order
+    random.shuffle(pairs)
+    return pairs
+def load_language_data(language: str):
+    """Load the A/B pairs for a language, reset the tallies and show the first pair.
+    Args:
+        language: Name of the language folder selected in the dropdown.
+    Returns:
+        A 7-tuple in the order the click handlers list their outputs:
+        original text, left translation, right translation, status,
+        progress, left-button update, right-button update.
+    """
+    def _status_only(message: str):
+        # Put the message in the fourth slot, which feeds the status Markdown,
+        # rather than in the first slot, which fills the "Text to Translate"
+        # box and would make the message look like source text.
+        return "", "", "", message, "0 / 0", gr.update(visible=False), gr.update(visible=False)
+    if not language:
+        return _status_only("Please select a language")
+    print(f"Loading data for language: {language}")
+    # Prepare A/B test data
+    samples = prepare_ab_test_data(language)
+    if not samples:
+        return _status_only("No data found for selected language")
+    # Update global state: a fresh load always restarts the session
+    app_state['current_samples'] = samples
+    app_state['current_index'] = 0
+    app_state['results'] = {'config_a': 0, 'config_b': 0}
+    app_state['language'] = language
+    app_state['total_samples'] = len(samples)
+    # Show first sample
+    return show_current_sample()
+def show_current_sample():
+    """Build the UI values for the current pair, or the final summary when done.
+    Returns:
+        A 7-tuple in the order the click handlers list their outputs:
+        original text, left translation, right translation, status,
+        progress, left-button update, right-button update. The vote buttons
+        are shown while a pair is on screen and hidden once testing is over.
+    """
+    samples = app_state['current_samples']
+    index = app_state['current_index']
+    if not samples or index >= len(samples):
+        # Show final results
+        votes_a = app_state['results']['config_a']
+        votes_b = app_state['results']['config_b']
+        total_votes = votes_a + votes_b
+        if total_votes == 0:
+            results_text = "No votes recorded."
+        else:
+            config_a_pct = (votes_a / total_votes) * 100
+            config_b_pct = (votes_b / total_votes) * 100
+            # Blank lines between entries keep Markdown from merging them
+            # into a single paragraph.
+            results_text = "\n\n".join([
+                f"## Final Results for {app_state['language']}",
+                f"**Few-shots config**: {votes_a} votes ({config_a_pct:.1f}%)",
+                f"**No-few-shots config**: {votes_b} votes ({config_b_pct:.1f}%)",
+                f"Total comparisons: {total_votes}",
+            ])
+        # The summary is Markdown, so it goes in the fourth slot (the status
+        # Markdown component); the first slot is a plain textbox that would
+        # show the raw markup.
+        return (
+            "Testing complete!",
+            "Testing complete!",
+            "Testing complete!",
+            results_text + "\n\nClick 'Load Data' to start over",
+            f"{index} / {app_state['total_samples']}",
+            gr.update(visible=False),
+            gr.update(visible=False)
+        )
+    left_doc, right_doc, metadata = samples[index]
+    # Extract translations from the first inference result of each document
+    # NOTE(review): assumes the loaded documents expose `inference_results` as
+    # an attribute - confirm against what the reader actually produces.
+    left_translation = extract_translation(left_doc.inference_results[0] if left_doc.inference_results else {})
+    right_translation = extract_translation(right_doc.inference_results[0] if right_doc.inference_results else {})
+    # Progress is 1-based for display
+    progress = f"{index + 1} / {app_state['total_samples']}"
+    return (
+        metadata['original_text'],
+        left_translation,
+        right_translation,
+        f"Language: {app_state['language']} | Progress: {progress}",
+        progress,
+        gr.update(visible=True),
+        gr.update(visible=True)
+    )
+def vote_left():
+    """Credit the config shown on the left with one vote, then advance."""
+    samples = app_state['current_samples']
+    position = app_state['current_index']
+    if samples and position < len(samples):
+        # The pair's metadata records which config was placed on the left
+        chosen_config = samples[position][2]['left_config']
+        tally_key = 'config_a' if chosen_config == "few-shots" else 'config_b'
+        app_state['results'][tally_key] += 1
+        # Step to the next pair
+        app_state['current_index'] += 1
+    # With nothing left to vote on this simply re-renders the summary
+    return show_current_sample()
+def vote_right():
+    """Credit the config shown on the right with one vote, then advance."""
+    samples = app_state['current_samples']
+    position = app_state['current_index']
+    if samples and position < len(samples):
+        # The pair's metadata records which config was placed on the right
+        chosen_config = samples[position][2]['right_config']
+        tally_key = 'config_a' if chosen_config == "few-shots" else 'config_b'
+        app_state['results'][tally_key] += 1
+        # Step to the next pair
+        app_state['current_index'] += 1
+    # With nothing left to vote on this simply re-renders the summary
+    return show_current_sample()
+# Create Gradio interface
+def create_interface():
+    """Build the Gradio Blocks UI and wire its buttons to the handlers above.
+    The language list is fetched from S3 once, when this function runs.
+    Returns:
+        The assembled gr.Blocks object (not yet launched).
+    """
+    languages = list_languages_from_s3()
+    with gr.Blocks(title="Translation A/B Testing", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# Translation Model A/B Testing")
+        gr.Markdown("Compare translations from different model configurations. Choose the better translation for each sample.")
+        with gr.Row():
+            # Preselect the first language when the listing returned any
+            language_dropdown = gr.Dropdown(
+                choices=languages,
+                label="Select Language",
+                value=languages[0] if languages else None
+            )
+            load_btn = gr.Button("Load Data", variant="primary")
+        status_text = gr.Markdown("")
+        progress_text = gr.Markdown("")
+        gr.Markdown("## Original Text")
+        original_text = gr.Textbox(label="Text to Translate", lines=3, interactive=False)
+        gr.Markdown("## Choose the Better Translation")
+        with gr.Row():
+            # Vote buttons start hidden; the handlers' return values toggle them
+            with gr.Column():
+                left_btn = gr.Button("Choose Left", variant="secondary", visible=False)
+                left_translation = gr.Textbox(label="Translation A", lines=4, interactive=False)
+            with gr.Column():
+                right_btn = gr.Button("Choose Right", variant="secondary", visible=False)
+                right_translation = gr.Textbox(label="Translation B", lines=4, interactive=False)
+        # Event handlers
+        # All three handlers return a 7-tuple matched positionally to this
+        # same outputs list.
+        load_btn.click(
+            fn=load_language_data,
+            inputs=[language_dropdown],
+            outputs=[original_text, left_translation, right_translation, status_text, progress_text, left_btn, right_btn]
+        )
+        left_btn.click(
+            fn=vote_left,
+            inputs=[],
+            outputs=[original_text, left_translation, right_translation, status_text, progress_text, left_btn, right_btn]
+        )
+        right_btn.click(
+            fn=vote_right,
+            inputs=[],
+            outputs=[original_text, left_translation, right_translation, status_text, progress_text, left_btn, right_btn]
+        )
+    return demo
+# Script entry point: build the UI and serve it.
+if __name__ == "__main__":
+    demo = create_interface()
+    # Listens on all interfaces, port 7860 (the port the README points to).
+    demo.launch(server_name="0.0.0.0", server_port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio>=4.0.0
+datatrove
+boto3
+pandas