sumuks committed on
Commit
0e100cf
·
verified ·
1 Parent(s): 0f56929

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +771 -0
app.py ADDED
@@ -0,0 +1,771 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Enhanced web document annotation tool with modern UI."""
3
+ import hashlib
4
+ import json
5
+ import os
6
+ import uuid
7
+ from collections import defaultdict
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime, timezone
10
+ from pathlib import Path
11
+ from random import sample, shuffle
12
+
13
+ import gradio as gr
14
+ from datasets import Dataset, load_dataset
15
+ from loguru import logger
16
+
17
# FDC (Free Decimal Correspondence) two-digit prefixes used by the filter.
# SCIENCE_CODES: prefixes that count as a science subject.
SCIENCE_CODES = "50 51 54 57 58 59 61".split()
# FDC_KEEP: prefixes a document must carry to be kept ("61" is Medicine).
FDC_KEEP = ["61"]
20
+
21
+
22
def prefix(dds_code: str) -> str:
    """Return the leading two characters of ``dds_code``.

    A falsy code (empty string or ``None``) yields ``""``; a code shorter
    than two characters is returned unchanged.
    """
    return dds_code[:2] if dds_code else ""
27
+
28
+
29
def doc_hash(url: str, text: str) -> str:
    """Return the SHA-256 hex digest of ``url`` immediately followed by ``text``.

    The two parts are joined without a separator and encoded with the default
    ``str.encode()`` codec before hashing.
    """
    payload = f"{url}{text}"
    digest = hashlib.sha256()
    digest.update(payload.encode())
    return digest.hexdigest()
31
+
32
+
33
def filterfunc(x: dict) -> bool:
    """Decide whether a dataset row is eligible for annotation.

    A row is kept only when all of the following hold:

    * its ``text`` has at least 100 whitespace-separated words;
    * none of its primary/secondary ``document_type_v1`` /
      ``document_type_v2`` labels is in the excluded set;
    * its primary and secondary ``dds`` labels pair a ``FDC_KEEP`` prefix
      with a ``SCIENCE_CODES`` prefix, in either order.

    Missing keys and explicit ``None`` values anywhere along the
    ``eai_taxonomy`` path (and a ``None`` ``text``) are treated as absent
    instead of raising ``AttributeError``.

    Args:
        x: One dataset row.

    Returns:
        ``True`` if the row should be kept, ``False`` otherwise.
    """

    def label_at(group: str, level: str):
        # Walk eai_taxonomy -> group -> level -> label, stopping at the first
        # node that is not a dict (covers both missing keys and None values).
        node = x
        for key in ("eai_taxonomy", group, level, "label"):
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    if len((x.get("text") or "").split()) < 100:
        return False

    excluded = {
        "Promotional/Advertisement",
        "Machine-Generated",
        "Images/Videos/Audio",
        "Truncated",
        "Spam/Ads",
        "Product Page",
        "Content Listing",
    }

    for version in ("document_type_v1", "document_type_v2"):
        for level in ("primary", "secondary"):
            label = label_at(version, level)
            if label and label in excluded:
                return False

    # FDC pairing: Medicine (61) together with another science code.
    primary = prefix(label_at("dds", "primary") or "")
    secondary = prefix(label_at("dds", "secondary") or "")

    return (
        (primary in FDC_KEEP and secondary in SCIENCE_CODES)
        or (secondary in FDC_KEEP and primary in SCIENCE_CODES)
    )
60
+
61
+
62
+ class DocLoader:
63
+ __slots__ = ("docs", "index", "processed", "_dataset")
64
+
65
+ def __init__(self, processed: set[str]):
66
+ self.processed = processed
67
+ self.index = 0
68
+ self.docs = []
69
+ self._dataset = {}
70
+ self._load()
71
+
72
+ def _load(self):
73
+ ds = load_dataset("sumuks/essential-web-v1.0-sample-100M", split="train")
74
+ logger.info(f"Loaded {len(ds)} documents")
75
+
76
+ ds = ds.filter(filterfunc)
77
+ logger.info(f"Filtered to {len(ds)} documents")
78
+
79
+ # Build dataset lookup and collect unprocessed docs
80
+ unprocessed = []
81
+
82
+ for idx, doc in enumerate(ds):
83
+ doc_key = doc.get("id", idx)
84
+ doc_with_key = dict(doc)
85
+ doc_with_key["_dataset_key"] = doc_key
86
+ self._dataset[doc_key] = doc_with_key
87
+
88
+ # Check if already processed
89
+ url = doc.get("metadata", {}).get("url", doc.get("url", ""))
90
+ h = doc_hash(url, doc.get("text", ""))
91
+
92
+ if h not in self.processed:
93
+ unprocessed.append(doc_with_key)
94
+
95
+ logger.info(f"Found {len(unprocessed)} unprocessed documents")
96
+
97
+ # Randomize the order for this session
98
+ shuffle(unprocessed)
99
+ self.docs = unprocessed
100
+
101
+ logger.info(f"Loaded {len(self.docs)} documents for this session")
102
+
103
+ def next(self) -> dict | None:
104
+ if self.index < len(self.docs):
105
+ doc = self.docs[self.index]
106
+ self.index += 1
107
+ return doc
108
+ return None
109
+
110
+ def get_by_id(self, doc_id: str | int) -> dict | None:
111
+ result = self._dataset.get(doc_id)
112
+ if result is None and isinstance(doc_id, str) and doc_id.isdigit():
113
+ result = self._dataset.get(int(doc_id))
114
+ elif result is None and isinstance(doc_id, int):
115
+ result = self._dataset.get(str(doc_id))
116
+ return result
117
+
118
+ @property
119
+ def remaining(self) -> int:
120
+ return max(0, len(self.docs) - self.index)
121
+
122
+
123
+ @dataclass(slots=True)
124
+ class AnnotationStore:
125
+ path: Path
126
+ session_id: str = field(default_factory=lambda: str(uuid.uuid4()))
127
+ buffer: list[dict] = field(default_factory=list)
128
+ threshold: int = 25
129
+ processed: set[str] = field(default_factory=set)
130
+ annotations: list[dict] = field(default_factory=list)
131
+ session_stats: dict = field(default_factory=lambda: {
132
+ "total": 0,
133
+ "selected": 0,
134
+ "discarded": 0,
135
+ "start_time": datetime.now(timezone.utc),
136
+ "decisions": []
137
+ })
138
+
139
+ def __post_init__(self):
140
+ self.path.parent.mkdir(parents=True, exist_ok=True)
141
+ if self.path.exists():
142
+ for line in self.path.read_text().splitlines():
143
+ if rec := self._parse_line(line):
144
+ self.processed.add(rec["hash"])
145
+ self.annotations.append(rec)
146
+ logger.info(f"Loaded {len(self.processed)} existing annotations")
147
+
148
+ def _parse_line(self, line: str) -> dict | None:
149
+ try:
150
+ return json.loads(line)
151
+ except:
152
+ return None
153
+
154
+ def add(self, doc_hash: str, decision: str, doc_id: str | int):
155
+ if doc_hash in self.processed:
156
+ logger.warning(f"Attempted to add already processed document: {doc_hash}")
157
+ return
158
+
159
+ rec = {
160
+ "hash": doc_hash,
161
+ "decision": decision,
162
+ "session": self.session_id,
163
+ "id": doc_id,
164
+ "timestamp": datetime.now(timezone.utc).isoformat(),
165
+ }
166
+
167
+ self.path.open("a").write(json.dumps(rec) + "\n")
168
+ self.processed.add(doc_hash)
169
+ self.buffer.append(rec)
170
+ self.annotations.append(rec)
171
+
172
+ self.session_stats["total"] += 1
173
+ if decision == "selected":
174
+ self.session_stats["selected"] += 1
175
+ elif decision == "discarded":
176
+ self.session_stats["discarded"] += 1
177
+ self.session_stats["decisions"].append((datetime.now(timezone.utc), decision))
178
+
179
+ if len(self.buffer) >= self.threshold:
180
+ self.flush()
181
+
182
+ def flush(self):
183
+ if not self.buffer or not (token := os.getenv("HF_TOKEN")):
184
+ self.buffer.clear()
185
+ return
186
+
187
+ try:
188
+ Dataset.from_list(self.buffer).push_to_hub(
189
+ "yourbench/essential-web-annotations",
190
+ token=token
191
+ )
192
+ logger.info(f"Pushed {len(self.buffer)} annotations")
193
+ self.buffer.clear()
194
+ except Exception as e:
195
+ logger.error(f"Push failed: {e}")
196
+
197
+ def get_rate(self) -> float:
198
+ if not self.session_stats["decisions"]:
199
+ return 0.0
200
+ elapsed = (datetime.now(timezone.utc) - self.session_stats["start_time"]).total_seconds()
201
+ return (self.session_stats["total"] / elapsed * 3600) if elapsed > 0 else 0.0
202
+
203
+ def get_filtered(self, decision: str | None = None) -> list[dict]:
204
+ if decision is None or decision == "all":
205
+ return self.annotations
206
+ return [a for a in self.annotations if a.get("decision") == decision]
207
+
208
+
209
# Maximum number of documents annotated in one session.
SESSION_LIMIT = 50

# Module-level state shared by all UI callbacks below.
store = AnnotationStore(path=Path("data/annotations.jsonl"))
loader = DocLoader(processed=store.processed)
# Document currently shown in the Annotate tab (None when nothing is left).
current = loader.next()

# Viewer state: the filtered annotation list, the cursor into it, and the filter name.
viewer_state = dict(annotations=[], index=0, filter="all")
221
+
222
+
223
def format_stats() -> str:
    """Render the session statistics row as an HTML fragment.

    Shows the session's total/selected/discarded counts, the hourly
    annotation rate from the store, and the loader's remaining-document count.
    """
    counts = store.session_stats
    hourly = store.get_rate()

    return f"""
    <div class="stats-container">
        <div class="stat-item">
            <div class="stat-value">{counts['total']}</div>
            <div class="stat-label">Total Annotated</div>
        </div>
        <div class="stat-item">
            <div class="stat-value">{counts['selected']}</div>
            <div class="stat-label">Selected</div>
        </div>
        <div class="stat-item">
            <div class="stat-value">{counts['discarded']}</div>
            <div class="stat-label">Discarded</div>
        </div>
        <div class="stat-item">
            <div class="stat-value">{hourly:.0f}/hr</div>
            <div class="stat-label">Annotation Rate</div>
        </div>
        <div class="stat-item">
            <div class="stat-value">{loader.remaining:,}</div>
            <div class="stat-label">Remaining Docs</div>
        </div>
    </div>
    """
251
+
252
+
253
def format_progress() -> tuple[str, float]:
    """Render the session progress bar.

    Returns:
        A pair ``(html, fraction)`` where ``fraction`` is the session's
        annotated count divided by ``SESSION_LIMIT`` (``0`` when the limit is
        not positive) and ``html`` shows the same value as a percentage.
    """
    done = store.session_stats["total"]
    limit = SESSION_LIMIT
    fraction = (done / limit) if limit > 0 else 0
    pct = fraction * 100

    markup = f"""
    <div class="progress-container">
        <div class="progress-header">
            <span class="progress-title">Session Progress</span>
            <span class="progress-numbers">{done:,} / {limit:,}</span>
        </div>
        <div class="progress-bar-bg">
            <div class="progress-bar-fill" style="width: {pct:.1f}%"></div>
        </div>
        <div class="progress-percentage">{pct:.1f}% Complete</div>
    </div>
    """
    return markup, fraction
275
+
276
+
277
+ def format_document_info(doc: dict, annotation: dict | None = None) -> str:
278
+ if not doc:
279
+ return ""
280
+
281
+ meta = doc.get("metadata", {})
282
+ url = meta.get("url", doc.get("url", ""))
283
+ domain = url.split('/')[2] if url and '/' in url else "Unknown"
284
+
285
+ cat = doc.get("eai_taxonomy", {}).get("document_type_v2", {}).get("primary", {}).get("label", "Uncategorized")
286
+
287
+ word_count = len(doc.get("text", "").split())
288
+
289
+ annotation_info = ""
290
+ if annotation:
291
+ timestamp = datetime.fromisoformat(annotation["timestamp"].replace("Z", "+00:00"))
292
+ decision_color = "#667eea" if annotation["decision"] == "selected" else "#f5576c"
293
+ annotation_info = f"""
294
+ <div class="annotation-info" style="border-left: 4px solid {decision_color};">
295
+ <span class="annotation-decision" style="color: {decision_color};">
296
+ {"βœ…" if annotation["decision"] == "selected" else "❌"} {annotation["decision"].title()}
297
+ </span>
298
+ <span class="annotation-time">πŸ“… {timestamp.strftime("%Y-%m-%d %H:%M:%S")}</span>
299
+ </div>
300
+ """
301
+
302
+ return f"""
303
+ <div class="doc-info">
304
+ {annotation_info}
305
+ <div class="doc-meta">
306
+ <span class="doc-domain">πŸ“Œ {domain}</span>
307
+ <span class="doc-category">🏷️ {cat}</span>
308
+ <span class="doc-words">πŸ“ {word_count:,} words</span>
309
+ </div>
310
+ <a href="{url}" target="_blank" class="doc-url">{url}</a>
311
+ </div>
312
+ """
313
+
314
+
315
def choose(decision: str):
    """Record ``decision`` for the current document and advance to the next one.

    Returns the seven output values for the Annotate tab. The finished state
    from ``done_state()`` is returned when there is no current document, when
    the session limit has been reached, or when the loader is exhausted.
    """
    global current

    if not current:
        return done_state()

    # Same URL lookup the loader uses, so the hash matches the stored one.
    source_url = current.get("metadata", {}).get("url", current.get("url", ""))
    digest = doc_hash(source_url, current.get("text", ""))
    dataset_key = current.get("_dataset_key", current.get("id", ""))
    store.add(digest, decision, dataset_key)

    if store.session_stats["total"] >= SESSION_LIMIT:
        return done_state()

    current = loader.next()
    if not current:
        return done_state()

    progress_html, progress_num = format_progress()
    info_html = format_document_info(current)
    body_text = current.get("text", "")

    return (
        info_html,
        body_text,
        gr.update(interactive=True),
        gr.update(interactive=True),
        format_stats(),
        progress_html,
        progress_num,
    )
345
+
346
+
347
def done_state():
    """Build the Annotate-tab outputs shown once no more annotating is possible.

    The message distinguishes a reached session limit from an exhausted
    document queue. Both action buttons are disabled and the numeric progress
    value is pinned to ``1.0``.

    Returns:
        A 7-tuple: banner HTML, description text, two button updates,
        stats HTML, progress HTML, and the progress number.
    """
    # Only the HTML half is needed; the numeric progress is fixed at 1.0 below.
    progress_html, _ = format_progress()

    if store.session_stats["total"] >= SESSION_LIMIT:
        message = "🎉 Session Complete!"
        description = f"Great job! You've completed your session of {SESSION_LIMIT} documents."
    else:
        message = "🎉 All documents annotated!"
        description = "Great job! You've completed all available documents."

    return (
        f"<div class='done-message'>{message}</div>",
        description,
        gr.update(interactive=False),
        gr.update(interactive=False),
        format_stats(),
        progress_html,
        1.0,
    )
366
+
367
+
368
def update_viewer_filter(filter_value: str):
    """Apply a new viewer filter, rewind to the first match and redraw the viewer."""
    viewer_state.update(filter=filter_value, index=0)

    matches = store.get_filtered(filter_value)
    viewer_state["annotations"] = matches

    logger.info(f"Filter: {filter_value}, Found {len(matches)} annotations")

    return update_viewer_display()
377
+
378
+
379
def navigate_viewer(direction: int):
    """Move the viewer cursor by ``direction`` (wrapping around) and redraw.

    With no annotations loaded the cursor is left untouched.
    """
    count = len(viewer_state["annotations"])
    if count:
        viewer_state["index"] = (viewer_state["index"] + direction) % count
    return update_viewer_display()
385
+
386
+
387
def update_viewer_display():
    """Build the Viewer-tab outputs for the annotation under the cursor.

    Returns:
        A 5-tuple: info HTML, document text, counter HTML, and updates for the
        previous/next buttons. The buttons are disabled at the respective end
        of the list, and both are disabled when the list is empty. If the
        annotated document cannot be found by id, an error banner and the raw
        annotation record are shown instead.
    """

    def counter(position: int, total: int) -> str:
        # Keep the wrapper div so the `.viewer-counter` styling used by the
        # component's initial value survives every update.
        return f"<div class='viewer-counter'>{position} / {total}</div>"

    annotations = viewer_state["annotations"]
    if not annotations:
        return (
            "<div class='viewer-empty'>No annotations to display</div>",
            "",
            counter(0, 0),
            gr.update(interactive=False),
            gr.update(interactive=False),
        )

    idx = viewer_state["index"]
    total = len(annotations)
    annotation = annotations[idx]
    doc = loader.get_by_id(annotation["id"])

    position = counter(idx + 1, total)
    prev_update = gr.update(interactive=idx > 0)
    next_update = gr.update(interactive=idx < total - 1)

    if not doc:
        logger.warning(f"Document not found for ID: {annotation['id']} (type: {type(annotation['id'])})")
        return (
            "<div class='viewer-error'>Document not found in dataset</div>",
            f"Annotation details: {json.dumps(annotation, indent=2)}",
            position,
            prev_update,
            next_update,
        )

    return (
        format_document_info(doc, annotation),
        doc.get("text", ""),
        position,
        prev_update,
        next_update,
    )
418
+
419
+
420
def build() -> gr.Blocks:
    """Assemble the Gradio application.

    The app has two tabs: "Annotate", showing the current document with
    Select/Discard buttons (also bound to the ``1`` and ``2`` keys through the
    script injected via ``head``), and "Viewer", for paging through existing
    annotations with an all/selected/discarded filter.

    Returns:
        The constructed ``gr.Blocks`` app, not yet launched.
    """
    css = """
    .stats-container {
        display: flex;
        gap: 15px;
        margin: 10px 0;
        flex-wrap: nowrap;
        justify-content: space-between;
    }
    .stat-item {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 12px;
        padding: 15px;
        flex: 1;
        min-width: 100px;
        text-align: center;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        transition: transform 0.2s;
    }
    .stat-item:hover {
        transform: translateY(-2px);
    }
    .stat-value {
        font-size: 24px;
        font-weight: bold;
        color: white;
        margin-bottom: 3px;
    }
    .stat-label {
        font-size: 12px;
        color: rgba(255, 255, 255, 0.9);
    }
    .progress-container {
        background: #f8f9fa;
        border-radius: 12px;
        padding: 15px;
        margin: 10px 0;
    }
    .progress-header {
        display: flex;
        justify-content: space-between;
        margin-bottom: 10px;
        font-weight: 600;
    }
    .progress-bar-bg {
        background: #e9ecef;
        height: 20px;
        border-radius: 10px;
        overflow: hidden;
        margin-bottom: 10px;
    }
    .progress-bar-fill {
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        height: 100%;
        transition: width 0.3s ease;
    }
    .progress-percentage {
        text-align: center;
        color: #6c757d;
        font-size: 14px;
    }
    .doc-info {
        background: #f8f9fa;
        border-radius: 12px;
        padding: 15px;
        margin-bottom: 10px;
    }
    .doc-meta {
        display: flex;
        gap: 20px;
        margin-bottom: 10px;
        flex-wrap: wrap;
    }
    .doc-meta span {
        font-size: 14px;
        color: #495057;
    }
    .doc-url {
        font-size: 14px;
        color: #667eea;
        text-decoration: none;
        word-break: break-all;
    }
    .doc-url:hover {
        text-decoration: underline;
    }
    .done-message {
        font-size: 32px;
        text-align: center;
        padding: 40px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 12px;
        font-weight: bold;
    }
    .annotation-info {
        display: flex;
        justify-content: space-between;
        margin-bottom: 10px;
        padding-left: 10px;
    }
    .annotation-decision {
        font-weight: 600;
    }
    .annotation-time {
        color: #6c757d;
        font-size: 12px;
    }
    .viewer-empty, .viewer-error {
        text-align: center;
        padding: 40px;
        color: #6c757d;
        font-size: 18px;
    }
    .viewer-nav {
        display: flex;
        justify-content: center;
        align-items: center;
        gap: 20px;
        margin: 10px 0;
    }
    .viewer-counter {
        font-weight: 600;
        color: #495057;
    }
    #select {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border: none;
        font-size: 18px;
        padding: 12px 24px;
    }
    #discard {
        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        border: none;
        font-size: 18px;
        padding: 12px 24px;
    }
    .dark .stat-item {
        background: linear-gradient(135deg, #434343 0%, #000000 100%);
    }
    .dark .progress-container, .dark .doc-info {
        background: #1a1a1a;
    }
    .dark .progress-bar-bg {
        background: #2a2a2a;
    }
    @keyframes pulse {
        0% { transform: scale(1); }
        50% { transform: scale(1.05); }
        100% { transform: scale(1); }
    }
    """

    # Keyboard shortcuts: "1" clicks Select and "2" clicks Discard, unless
    # focus is in a form control. The keydown/keyup pair adds press feedback.
    shortcut_js = """
    <script>
    function handleKeyboardShortcuts(e) {
        var target = e.target || e.srcElement;
        switch (target.tagName.toLowerCase()) {
            case "input":
            case "textarea":
            case "select":
            case "button":
                return;
            default:
                if (e.code === "Digit1" || e.key === "1") {
                    var selectBtn = document.getElementById("select");
                    if (selectBtn && !selectBtn.disabled) {
                        selectBtn.click();
                        e.preventDefault();
                    }
                }
                else if (e.code === "Digit2" || e.key === "2") {
                    var discardBtn = document.getElementById("discard");
                    if (discardBtn && !discardBtn.disabled) {
                        discardBtn.click();
                        e.preventDefault();
                    }
                }
        }
    }

    document.addEventListener('keyup', handleKeyboardShortcuts, false);

    document.addEventListener('keydown', function(e) {
        if ((e.code === "Digit1" || e.key === "1") && document.getElementById("select") && !document.getElementById("select").disabled) {
            document.getElementById("select").style.transform = "scale(0.95)";
        }
        if ((e.code === "Digit2" || e.key === "2") && document.getElementById("discard") && !document.getElementById("discard").disabled) {
            document.getElementById("discard").style.transform = "scale(0.95)";
        }
    });

    document.addEventListener('keyup', function(e) {
        if (e.code === "Digit1" || e.key === "1") {
            var btn = document.getElementById("select");
            if (btn) btn.style.transform = "scale(1)";
        }
        if (e.code === "Digit2" || e.key === "2") {
            var btn = document.getElementById("discard");
            if (btn) btn.style.transform = "scale(1)";
        }
    });
    </script>
    """

    with gr.Blocks(
        title="Essential Web Annotation",
        theme=gr.themes.Default(),
        css=css,
        head=shortcut_js,
    ) as demo:
        gr.Markdown("# 🚀 Essential Web Annotation Tool")

        with gr.Tabs():
            with gr.Tab("Annotate"):
                gr.Markdown("""
                ## 📋 Document Quality Assessment

                Your task is to evaluate documents for **high-quality, valuable content** that provides generalizable information.

                ### ✅ **Select High-Quality Documents:**
                Examples include:
                - **Technical blogs** with detailed explanations
                - **Scientific papers** and research articles
                - **Information-rich discussions** with insights
                - **Educational content** with actionable knowledge
                - **Professional documentation** and guides

                ### ❌ **Discard Low-Quality Documents:**
                - Content with minimal informational value

                ### 🎯 **Quick Assessment Tips:**
                - High-quality documents are usually immediately recognizable to a human.
                - Use the **Viewer** tab to browse examples of selected documents
                - Trust your judgment on content value and depth

                ### ⌨️ **Keyboard Shortcuts:**
                | Key | Action |
                |-----|--------|
                | **`1`** | ✅ Select document |
                | **`2`** | ❌ Discard document |
                """)

                progress_html, progress_num = format_progress()

                progress_display = gr.HTML(progress_html)
                stats_display = gr.HTML(format_stats())

                if current:
                    doc_info_html = format_document_info(current)
                    text_val = current.get("text", "")
                else:
                    doc_info_html = "<div class='doc-info'>No documents loaded.</div>"
                    text_val = ""

                doc_info = gr.HTML(doc_info_html)

                with gr.Column(variant="panel"):
                    text_display = gr.Textbox(
                        text_val,
                        label="📄 Document Content",
                        lines=20,
                        interactive=False,
                        show_copy_button=True,
                    )

                with gr.Row():
                    btn_sel = gr.Button(
                        "✅ Select (1)",
                        elem_id="select",
                        variant="primary",
                        interactive=bool(current),
                        size="lg",
                    )
                    btn_dis = gr.Button(
                        "❌ Discard (2)",
                        elem_id="discard",
                        variant="stop",
                        interactive=bool(current),
                        size="lg",
                    )

                # Hidden carrier for the numeric progress value.
                progress_bar = gr.Number(value=progress_num, visible=False)

                # Order must match the tuples returned by choose()/done_state().
                outputs = [doc_info, text_display, btn_sel, btn_dis, stats_display, progress_display, progress_bar]

                btn_sel.click(lambda: choose("selected"), outputs=outputs)
                btn_dis.click(lambda: choose("discarded"), outputs=outputs)

            with gr.Tab("Viewer"):
                gr.Markdown("### 📚 Browse Annotated Documents")

                with gr.Row():
                    filter_dropdown = gr.Radio(
                        choices=["all", "selected", "discarded"],
                        value="all",
                        label="Filter",
                        interactive=True,
                    )

                viewer_info = gr.HTML()

                with gr.Column(variant="panel"):
                    viewer_text = gr.Textbox(
                        label="📄 Document Content",
                        lines=20,
                        interactive=False,
                        show_copy_button=True,
                    )

                with gr.Row():
                    prev_btn = gr.Button("← Previous", size="lg")
                    viewer_counter = gr.HTML("<div class='viewer-counter'>0 / 0</div>")
                    next_btn = gr.Button("Next →", size="lg")

                # Order must match the tuple returned by update_viewer_display().
                viewer_outputs = [viewer_info, viewer_text, viewer_counter, prev_btn, next_btn]

                filter_dropdown.change(
                    update_viewer_filter,
                    inputs=[filter_dropdown],
                    outputs=viewer_outputs,
                )

                prev_btn.click(
                    lambda: navigate_viewer(-1),
                    outputs=viewer_outputs,
                )

                next_btn.click(
                    lambda: navigate_viewer(1),
                    outputs=viewer_outputs,
                )

        # Populate the viewer with the unfiltered list on page load.
        demo.load(
            lambda: update_viewer_filter("all"),
            outputs=viewer_outputs,
        )

        # NOTE(review): a <script> placed in gr.HTML may be rendered as inert
        # markup and never run; confirm, and move it to `head` if the pulse
        # animation is wanted.
        gr.HTML("""
        <script>
        const observer = new MutationObserver(() => {
            document.querySelectorAll('.stat-item').forEach(item => {
                item.style.animation = 'pulse 0.3s ease-out';
            });
        });
        observer.observe(document.body, { childList: true, subtree: true });
        </script>
        """)

    return demo
768
+
769
+
770
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    app = build()
    app.launch()