jeanma committed on
Commit
3fabb88
·
1 Parent(s): 8199442

Add application file

Browse files
Files changed (2) hide show
  1. app.py +91 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from huggingface_hub import hf_hub_download
3
+ import pandas as pd
4
+ import os
5
+ from unicodedata import normalize
6
+ import tempfile
7
+
8
# Optional Hugging Face access token, read once at import time from the
# HF_TOKEN environment variable; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
9
+
10
+
11
def process_file(dataset_type, user_file):
    """Convert an uploaded one-segment-per-line text file to Parquet.

    The upload is accepted only if it has exactly as many lines as the
    reference file (``eng_Latn.parquet``) of the selected dataset has rows.
    Each line is NFC-normalised and stripped of surrounding whitespace, then
    written out together with constant placeholder metadata columns.

    Args:
        dataset_type: One of ``"FLORES+ dev"``, ``"FLORES+ devtest"`` or
            ``"OLDI-Seed"``.
        user_file: The uploaded file as passed by Gradio: either a path
            string or an object whose ``name`` attribute is the path.
            ``None`` when nothing was uploaded.

    Returns:
        Path of the Parquet file, written into a newly created temporary
        directory.

    Raises:
        gr.Error: If no file was uploaded, the dataset type is unknown, the
            upload is not valid UTF-8, or the line count does not match the
            reference row count.
    """
    if user_file is None:
        # gr.Error is an exception: it has to be raised for Gradio to show it.
        raise gr.Error("Please upload your data.")

    # Dataset label -> (Hub dataset repo, reference file inside the repo).
    references = {
        "FLORES+ dev": ("openlanguagedata/flores_plus", "dev/eng_Latn.parquet"),
        "FLORES+ devtest": (
            "openlanguagedata/flores_plus",
            "devtest/eng_Latn.parquet",
        ),
        "OLDI-Seed": ("openlanguagedata/oldi_seed", "seed/eng_Latn.parquet"),
    }
    if dataset_type not in references:
        raise gr.Error(f'Invalid dataset type "{dataset_type}".')
    repo_id, reference_name = references[dataset_type]

    reference_file = hf_hub_download(
        repo_id=repo_id,
        filename=reference_name,
        repo_type="dataset",
        token=HF_TOKEN,  # `use_auth_token` is the deprecated spelling
    )
    reference_size = len(pd.read_parquet(reference_file))

    # Depending on the Gradio version / File component settings the upload
    # arrives as a path string or as a file wrapper exposing `.name`.
    upload_path = getattr(user_file, "name", user_file)
    try:
        with open(upload_path, "rt", encoding="utf-8") as f:
            user_lines = f.readlines()
    except UnicodeDecodeError as err:
        raise gr.Error(
            "The file you uploaded is not valid UTF-8 text."
        ) from err
    user_size = len(user_lines)

    if reference_size != user_size:
        raise gr.Error(
            f"Line count mismatch: reference has {reference_size} rows, "
            f"the file you uploaded has {user_size} lines."
        )

    # The language/script/glottocode values are constant placeholders; only
    # "id" and "text" are derived from the upload.
    user_data = [
        {
            "id": i,
            "iso_639_3": "xxx",
            "iso_15924": "Xxxx",
            "glottocode": "xxxx1234",
            "text": normalize("NFC", line).strip(),
            "last_updated": "2.1",
        }
        for i, line in enumerate(user_lines)
    ]

    # The directory is intentionally not removed here: the returned path must
    # still exist after this function returns so it can be offered for
    # download.
    temp_dir = tempfile.mkdtemp()
    target_path = os.path.join(temp_dir, "xxx_Xxxx.parquet")
    pd.DataFrame(user_data).to_parquet(target_path, index=False)

    return target_path
74
+
75
+
76
# UI: pick a dataset type, upload a text file, press "Check"; the resulting
# Parquet file (or an error) comes back through the second File component.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset checker")
    dataset_type = gr.Dropdown(
        ["FLORES+ dev", "FLORES+ devtest", "OLDI-Seed"],
        label="Dataset type",
    )
    dataset_file = gr.File(label="Dataset file")
    parquet_file = gr.File(label="Download Parquet file")
    btn = gr.Button("Check")
    btn.click(
        fn=process_file,
        inputs=[dataset_type, dataset_file],
        outputs=parquet_file,
    )

# Start the server only when executed as a script, so importing this module
# (e.g. to reuse `process_file` or `demo`) has no side effects.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pandas==2.2
2
+ pyarrow==19.0