guipenedo HF Staff committed on
Commit
c502324
·
1 Parent(s): 5a49ee7
Files changed (3) hide show
  1. README.md +53 -1
  2. app.py +292 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -10,4 +10,56 @@ pinned: false
10
  short_description: A/B test translations
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  short_description: A/B test translations
11
  ---
12
 
13
+ # Translation A/B Testing App
14
+
15
+ A Gradio app for comparing translation quality between different model configurations through A/B testing.
16
+
17
+ ## Features
18
+
19
+ - **Language Selection**: Choose from available languages in the S3 bucket
20
+ - **Side-by-Side Comparison**: Compare translations from "few-shots" vs "no-few-shots" configurations
21
+ - **Randomized Presentation**: The order of configurations is randomized to avoid bias
22
+ - **Progress Tracking**: Shows current progress through the dataset
23
+ - **Results Summary**: Displays final vote counts and percentages
24
+
25
+ ## Setup
26
+
27
+ 1. Install dependencies:
28
+ ```bash
29
+ pip install -r requirements.txt
30
+ ```
31
+
32
+ 2. Configure AWS credentials (for S3 access):
33
+ ```bash
34
+ export AWS_ACCESS_KEY_ID=your_key
35
+ export AWS_SECRET_ACCESS_KEY=your_secret
36
+ # or use AWS CLI: aws configure
37
+ ```
38
+
39
+ 3. Run the app:
40
+ ```bash
41
+ python app.py
42
+ ```
43
+
44
+ The app will be available at `http://localhost:7860`
45
+
46
+ ## Usage
47
+
48
+ 1. **Select Language**: Choose a language from the dropdown menu
49
+ 2. **Load Data**: Click "Load Data" to fetch translation pairs from S3
50
+ 3. **Compare Translations**:
51
+ - Original text is shown at the top
52
+ - Two translations (A and B) are shown side by side
53
+ - Click "Choose Left" or "Choose Right" to select the better translation
54
+ 4. **View Results**: After all comparisons, see the final vote counts
55
+
56
+ ## Data Source
57
+
58
+ The app loads translation data from `s3://fineweb-multilingual-v1/experiments/translations/vibe-checks/` with the following structure:
59
+ - `{language}_Latn/few-shots.jsonl` - Translations with few-shot examples
60
+ - `{language}_Latn/no-few-shots.jsonl` - Translations without few-shot examples
61
+
62
+ Each JSONL file contains documents with:
63
+ - `text`: Original text to translate
64
+ - `id`: Unique document identifier
65
+ - `inference_results`: Array with translation results
app.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gradio app for A/B testing different translation configs
4
+ """
5
+
6
+ import gradio as gr
7
+ import boto3
8
+ import json
9
+ import random
10
+ import re
11
+ from pathlib import Path
12
+ from datatrove.pipeline.readers.jsonl import JsonlReader
13
+ from collections import defaultdict
14
+ from typing import Dict, List, Tuple, Optional
15
+
16
# Initialize S3 client
# Created once at import time; list_languages_from_s3 uses it for listing.
s3_client = boto3.client('s3')
# Bucket and key prefix under which the per-language folders are looked up.
BUCKET_NAME = "fineweb-multilingual-v1"
BASE_PREFIX = "experiments/translations/vibe-checks/"

# Global state for the app
# NOTE(review): this dict lives at module level, so it looks like it is shared
# by every browser session served by this process — confirm that is intended.
app_state = {
    'current_samples': [],  # (left_doc, right_doc, metadata) tuples for the loaded language
    'current_index': 0,  # position of the sample currently on screen
    'results': {'config_a': 0, 'config_b': 0},  # votes: config_a = "few-shots", config_b = the other config
    'config_names': [],  # not written by any function visible in this file
    'language': '',  # language chosen when data was last loaded
    'total_samples': 0  # len(current_samples) at load time
}
30
+
31
def list_languages_from_s3() -> List[str]:
    """List available languages from S3 bucket.

    Every "folder" directly under ``BASE_PREFIX`` in ``BUCKET_NAME`` is
    treated as one language.

    Returns:
        The folder names, sorted, with the leading ``BASE_PREFIX`` and the
        trailing slash removed. An empty list if the listing fails for any
        reason (the error is printed, not raised).
    """
    try:
        # One list_objects_v2 call only returns a bounded page of results, so
        # walk every page instead of reading just the first response.
        paginator = s3_client.get_paginator('list_objects_v2')
        pages = paginator.paginate(
            Bucket=BUCKET_NAME,
            Prefix=BASE_PREFIX,
            Delimiter='/',
        )

        languages = []
        for page in pages:
            for entry in page.get('CommonPrefixes', []):
                folder = entry['Prefix']
                # Strip only the leading prefix; str.replace would also drop
                # any later occurrence of the same text inside the key.
                if folder.startswith(BASE_PREFIX):
                    folder = folder[len(BASE_PREFIX):]
                languages.append(folder.rstrip('/'))

        return sorted(languages)
    except Exception as e:
        # Best effort: the UI still renders, just with an empty dropdown.
        print(f"Error listing languages: {e}")
        return []
50
+
51
def extract_translation(inference_result: dict) -> str:
    """Extract translation from inference result, removing START_TRANSLATION tags.

    Args:
        inference_result: Mapping expected to hold the raw model output under
            the ``'text'`` key. May be empty or ``None``.

    Returns:
        The stripped text between ``<START_TRANSLATION>`` and the next
        ``<END_TRANSLATION>``. If the end tag is missing, everything after
        the start tag is returned. If there is no start tag, the whole text is
        returned with any stray end tag removed. ``"No translation
        available"`` is returned when there is no usable ``'text'`` value.
    """
    if not inference_result or 'text' not in inference_result:
        return "No translation available"

    text = inference_result['text']
    if not isinstance(text, str):
        # e.g. a null 'text' field; show the placeholder instead of raising.
        return "No translation available"

    start_tag = '<START_TRANSLATION>'
    end_tag = '<END_TRANSLATION>'

    _, found_start, after_start = text.partition(start_tag)
    if found_start:
        # When the end tag is absent, partition leaves the remainder intact,
        # so the tag-free tail is still returned.
        translation, _, _ = after_start.partition(end_tag)
        return translation.strip()

    # Fallback: no start tag found; return the text without a dangling end tag.
    return text.replace(end_tag, '').strip()
65
+
66
def load_config_data(language: str, config_name: str) -> List[dict]:
    """Read all documents of one config's JSONL file for a language.

    The file is addressed as
    ``s3://BUCKET_NAME/BASE_PREFIX<language>/<config_name>.jsonl`` and read
    through datatrove's ``JsonlReader``.

    Returns:
        Every document the reader yields, in order. An empty list if anything
        goes wrong (the error is printed, not raised).
    """
    try:
        source_uri = f"s3://{BUCKET_NAME}/{BASE_PREFIX}{language}/{config_name}.jsonl"
        print(f"Loading data from: {source_uri}")

        # Drain the reader in one go; a failure part-way through falls into
        # the handler below and yields an empty result.
        return list(JsonlReader(source_uri)())
    except Exception as err:
        print(f"Error loading {config_name} data for {language}: {err}")
        return []
83
+
84
def prepare_ab_test_data(language: str) -> List[Tuple[dict, dict, dict]]:
    """Build shuffled, side-randomized sample pairs for one language.

    Documents from the "few-shots" and "no-few-shots" configs are matched on
    their ``id``; ids present in only one config are dropped.

    Returns:
        A list of ``(left_doc, right_doc, info)`` tuples in random order.
        ``info`` records which config is on each side (``'left_config'``,
        ``'right_config'``) and the source text (``'original_text'``, taken
        from the "few-shots" document).
    """
    first_name, second_name = "few-shots", "no-few-shots"

    first_docs = load_config_data(language, first_name)
    second_docs = load_config_data(language, second_name)

    print(f"Loaded {len(first_docs)} samples for {first_name}")
    print(f"Loaded {len(second_docs)} samples for {second_name}")

    # Index each config by document id (a later duplicate id wins).
    first_by_id = {}
    for doc in first_docs:
        first_by_id[doc.id] = doc
    second_by_id = {}
    for doc in second_docs:
        second_by_id[doc.id] = doc

    shared_ids = set(first_by_id.keys()) & set(second_by_id.keys())
    print(f"Found {len(shared_ids)} common document IDs")

    pairs = []
    for shared_id in shared_ids:
        first_doc = first_by_id[shared_id]
        second_doc = second_by_id[shared_id]

        # Coin flip per pair so a config is not tied to a screen side.
        first_on_left = random.random() < 0.5
        if first_on_left:
            sides = (first_doc, second_doc)
            info = {'left_config': first_name, 'right_config': second_name}
        else:
            sides = (second_doc, first_doc)
            info = {'left_config': second_name, 'right_config': first_name}
        info['original_text'] = first_doc.text

        pairs.append((sides[0], sides[1], info))

    random.shuffle(pairs)
    return pairs
127
+
128
def load_language_data(language: str):
    """Load the pairs for ``language``, reset the session state, show sample 1.

    Returns:
        A 7-tuple of UI values. When no language is given or no pairs are
        found, the first element is a message, the next three are empty, the
        progress reads "0 / 0" and both vote buttons are hidden. Otherwise it
        is whatever ``show_current_sample()`` returns for the first pair.
    """
    def blank_screen(message):
        # Message in the first slot, everything else empty, buttons hidden.
        return message, "", "", "", "0 / 0", gr.update(visible=False), gr.update(visible=False)

    if not language:
        return blank_screen("Please select a language")

    print(f"Loading data for language: {language}")

    pairs = prepare_ab_test_data(language)
    if not pairs:
        return blank_screen("No data found for selected language")

    # Start a fresh run: rewind to the first pair and zero the tallies.
    app_state.update({
        'current_samples': pairs,
        'current_index': 0,
        'results': {'config_a': 0, 'config_b': 0},
        'language': language,
        'total_samples': len(pairs),
    })

    return show_current_sample()
150
+
151
def _first_inference_result(doc) -> dict:
    """Return the first inference result attached to ``doc``, or ``{}``."""
    results = getattr(doc, 'inference_results', None)
    if not results:
        # NOTE(review): datatrove readers appear to store non-core JSON fields
        # under Document.metadata rather than as attributes — confirm. Both
        # locations are checked so either layout works.
        metadata = getattr(doc, 'metadata', None)
        if isinstance(metadata, dict):
            results = metadata.get('inference_results')
    return results[0] if results else {}


def show_current_sample():
    """Display the current sample, or the final tally once all are voted on.

    Returns:
        A 7-tuple of UI values. While samples remain: original text, left
        translation, right translation, a status line, the "i / total"
        progress, and two updates that show the vote buttons. When no samples
        are loaded or the index is past the end: the results summary,
        "Testing complete!" twice, a restart hint, the progress, and two
        updates that hide the vote buttons.
    """
    samples = app_state['current_samples']
    index = app_state['current_index']

    if not samples or index >= len(samples):
        # Show final results
        tallies = app_state['results']
        total_votes = tallies['config_a'] + tallies['config_b']
        if total_votes == 0:
            results_text = "No votes recorded."
        else:
            config_a_pct = (tallies['config_a'] / total_votes) * 100
            config_b_pct = (tallies['config_b'] / total_votes) * 100
            results_text = f"""
## Final Results for {app_state['language']}

**Few-shots config**: {tallies['config_a']} votes ({config_a_pct:.1f}%)
**No-few-shots config**: {tallies['config_b']} votes ({config_b_pct:.1f}%)

Total comparisons: {total_votes}
"""

        return (
            results_text,
            "Testing complete!",
            "Testing complete!",
            "Click 'Load Data' to start over",
            f"{index} / {app_state['total_samples']}",
            gr.update(visible=False),
            gr.update(visible=False)
        )

    left_doc, right_doc, metadata = samples[index]

    # Extract translations
    left_translation = extract_translation(_first_inference_result(left_doc))
    right_translation = extract_translation(_first_inference_result(right_doc))

    # Progress is 1-based for display.
    progress = f"{index + 1} / {app_state['total_samples']}"

    return (
        metadata['original_text'],
        left_translation,
        right_translation,
        f"Language: {app_state['language']} | Progress: {progress}",
        progress,
        gr.update(visible=True),
        gr.update(visible=True)
    )
197
+
198
def vote_left():
    """Count a vote for whichever config is on the left, then advance.

    If no sample is currently active (nothing loaded, or already past the
    last pair) nothing is recorded. In every case the return value is
    whatever ``show_current_sample()`` produces afterwards.
    """
    samples = app_state['current_samples']
    position = app_state['current_index']

    if samples and position < len(samples):
        # The third tuple element says which config sits on which side.
        side_info = samples[position][2]
        tally_key = 'config_a' if side_info['left_config'] == "few-shots" else 'config_b'
        app_state['results'][tally_key] += 1

        # Move to next sample
        app_state['current_index'] = position + 1

    return show_current_sample()
216
+
217
def vote_right():
    """Count a vote for whichever config is on the right, then advance.

    If no sample is currently active (nothing loaded, or already past the
    last pair) nothing is recorded. In every case the return value is
    whatever ``show_current_sample()`` produces afterwards.
    """
    samples = app_state['current_samples']
    position = app_state['current_index']

    if samples and position < len(samples):
        # The third tuple element says which config sits on which side.
        side_info = samples[position][2]
        tally_key = 'config_a' if side_info['right_config'] == "few-shots" else 'config_b'
        app_state['results'][tally_key] += 1

        # Move to next sample
        app_state['current_index'] = position + 1

    return show_current_sample()
235
+
236
+ # Create Gradio interface
237
def create_interface():
    """Build the Gradio Blocks UI and wire its three buttons.

    The language list is fetched from S3 once, when this function runs.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    languages = list_languages_from_s3()

    with gr.Blocks(title="Translation A/B Testing", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Translation Model A/B Testing")
        gr.Markdown("Compare translations from different model configurations. Choose the better translation for each sample.")

        with gr.Row():
            language_dropdown = gr.Dropdown(
                choices=languages,
                label="Select Language",
                # Preselect the first language when the listing is non-empty.
                value=languages[0] if languages else None
            )
            load_btn = gr.Button("Load Data", variant="primary")

        status_text = gr.Markdown("")
        progress_text = gr.Markdown("")

        gr.Markdown("## Original Text")
        original_text = gr.Textbox(label="Text to Translate", lines=3, interactive=False)

        gr.Markdown("## Choose the Better Translation")

        with gr.Row():
            with gr.Column():
                # Vote buttons start hidden; the handlers' return values toggle them.
                left_btn = gr.Button("Choose Left", variant="secondary", visible=False)
                left_translation = gr.Textbox(label="Translation A", lines=4, interactive=False)

            with gr.Column():
                right_btn = gr.Button("Choose Right", variant="secondary", visible=False)
                right_translation = gr.Textbox(label="Translation B", lines=4, interactive=False)

        # Event handlers
        # All three handlers return a 7-tuple mapped onto the same outputs,
        # in this order.
        load_btn.click(
            fn=load_language_data,
            inputs=[language_dropdown],
            outputs=[original_text, left_translation, right_translation, status_text, progress_text, left_btn, right_btn]
        )

        left_btn.click(
            fn=vote_left,
            inputs=[],
            outputs=[original_text, left_translation, right_translation, status_text, progress_text, left_btn, right_btn]
        )

        right_btn.click(
            fn=vote_right,
            inputs=[],
            outputs=[original_text, left_translation, right_translation, status_text, progress_text, left_btn, right_btn]
        )

    return demo
289
+
290
# Script entry point: build the UI and serve it on port 7860, bound to
# 0.0.0.0 rather than only localhost.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ datatrove
3
+ boto3
4
+ pandas