Spaces:

HuggingFaceGECLM
/

dataset_explorer

Runtime error

App Files Files Community

ola13 committed on Mar 28, 2023

Commit

de3513e

1 Parent(s): a7117c1

Init space

Browse files

Files changed (18) hide show

.gitattributes +1 -0
README.md +2 -2
app.py +194 -0
data/anton_cc_examples_with_stats.json +3 -0
data/bigcode_python_code_examples_with_stats.json +3 -0
data/bigcode_python_github_issues_examples_with_stats.json +3 -0
data/bigcode_python_jupyter_markdowned_clean_dedup_examples_with_stats.json +3 -0
data/bigcode_python_jupyter_scripts_dedup_filtered_examples_with_stats.json +3 -0
data/books3_examples_with_stats.json +3 -0
data/c4_examples_with_stats.json +3 -0
data/cc_filtered_text_examples_with_stats.json +3 -0
data/gutenberg_raw_examples_with_stats.json +3 -0
data/helen_cc_bad_examples_with_stats.json +3 -0
data/helen_cc_good_examples_with_stats.json +3 -0
data/reddit_threaded_examples_with_stats.json +3 -0
data/s2orc_raw_examples_with_stats.json +3 -0
data/stackexchange2_examples_with_stats.json +3 -0
requirements.txt +2 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 title: Dataset Explorer
-emoji: 😻
-colorFrom: green
 colorTo: purple
 sdk: gradio
 sdk_version: 3.23.0

 ---
 title: Dataset Explorer
+emoji: 🌘
+colorFrom: yellow
 colorTo: purple
 sdk: gradio
 sdk_version: 3.23.0

app.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import gradio as gr
+import jsonlines
+import os
+import uuid
+from datetime import datetime
+from huggingface_hub import HfApi
+from pprint import pprint
+# Names of the datasets offered in the UI dropdown; each name is also passed
+# to `line_generator` to build the module-level `line_generators` mapping.
+datasets = [
+    "gutenberg_raw",
+    "stackexchange2",
+    "bigcode_python_code",
+    "bigcode_python_github_issues",
+    "bigcode_python_jupyter_scripts_dedup_filtered",
+    "books3",
+    "c4",
+    "s2orc_raw",
+    "reddit_threaded",
+    "cc_filtered_text",
+]
+def line_generator(dataset):
+    if dataset == "gutenberg_raw":
+        with jsonlines.open("data/gutenberg_raw_examples_with_stats.json", "r") as f:
+            for line in f:
+                yield line
+    if dataset == "stackexchange2":
+        with jsonlines.open("data/stackexchange2_examples_with_stats.json", "r") as f:
+            for line in f:
+                yield line
+    if dataset == "bigcode_python_code":
+        with jsonlines.open(
+            "data/bigcode_python_code_examples_with_stats.json", "r"
+        ) as f:
+            for line in f:
+                yield line
+    if dataset == "bigcode_python_github_issues":
+        with jsonlines.open(
+            "data/bigcode_python_github_issues_examples_with_stats.json", "r"
+        ) as f:
+            for line in f:
+                yield line
+    if dataset == "bigcode_python_jupyter_scripts_dedup_filtered":
+        with jsonlines.open(
+            "data/bigcode_python_jupyter_scripts_dedup_filtered_examples_with_stats.json",
+            "r",
+        ) as f:
+            for line in f:
+                yield line
+    if dataset == "books3":
+        with jsonlines.open("data/books3_examples_with_stats.json", "r") as f:
+            for line in f:
+                yield line
+    if dataset == "c4":
+        with jsonlines.open("data/c4_examples_with_stats.json", "r") as f:
+            for line in f:
+                yield line
+    if dataset == "s2orc_raw":
+        with jsonlines.open("data/s2orc_raw_examples_with_stats.json", "r") as f:
+            for line in f:
+                yield line
+    if dataset == "reddit_threaded":
+        with jsonlines.open("data/reddit_threaded_examples_with_stats.json", "r") as f:
+            for line in f:
+                yield line
+    if dataset == "cc_filtered_text":
+        with jsonlines.open("data/reddit_threaded_examples_with_stats.json", "r") as f:
+            for line in f:
+                yield line
+# One generator per dataset, created once when the module is loaded.
+# NOTE(review): these are module-level objects, so presumably every browser
+# session advances the same iterators, and `next()` on one of them raises
+# StopIteration once its file has been read to the end — confirm whether
+# that is acceptable for long-running deployments.
+line_generators = {dataset: line_generator(dataset) for dataset in datasets}
+def send_report(sample, dataset, reason, annotator):
+    text = sample["text"]
+    sample.pop("text")
+    sample_id = ""
+    if "id" not in sample:
+        if "title" in sample:
+            sample_id = sample["title"]
+    else:
+        sample_id = sample["id"]
+    print("submitting")
+    pprint(
+        {
+            "dataset": dataset,
+            "docid": sample_id,
+            "text": text,
+            "metadata": sample,
+            "reason": reason,
+            "annotator": annotator,
+            "timestamp": str(datetime.now()),
+        }
+    )
+    with jsonlines.open("report.jsonl", "w") as f:
+        f.write(
+            {
+                "dataset": dataset,
+                "docid": sample_id,
+                "text": text,
+                "metadata": sample,
+                "reason": reason,
+                "annotator": annotator,
+                "timestamp": str(datetime.now()),
+            }
+        )
+    print("geclm_token", os.environ.get("geclm_token"))
+    api = HfApi()
+    api.upload_file(
+        path_or_fileobj="report.jsonl",
+        path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
+        repo_id="HuggingFaceGECLM/data_feedback",
+        repo_type="dataset",
+        token=os.environ.get("geclm_token"),
+    )
+if __name__ == "__main__":
+    demo = gr.Blocks()
+    with demo:
+        current_sample_state = gr.State(dict())
+        with gr.Row():
+            annotator = gr.Textbox(
+                lines=1,
+                max_lines=1,
+                placeholder="Type your name here if you'd like it to be recorded.",
+                label="Annotator",
+            )
+        with gr.Row():
+            dataset = gr.Dropdown(
+                choices=datasets,
+                value="Pick a dataset below",
+                label="Dataset",
+            )
+        with gr.Row():
+            reason_txt = gr.Textbox(
+                label="Flagging reason",
+                placeholder="Provide the reason for flagging if you think the sample is bad.",
+                visible=False,
+            )
+        with gr.Row():
+            bad_btn = gr.Button("Bad", visible=False)
+            good_btn = gr.Button("Next", visible=False)
+        with gr.Row():
+            text = gr.Markdown(visible=False)
+        def next_line(dataset):
+            next_line = next(line_generators[dataset])
+            return [
+                gr.update(value="<pre>" + next_line["text"] + "</pre>", visible=True),
+                next_line,
+                gr.update(visible=True),
+                gr.update(visible=True),
+                gr.update(visible=True),
+            ]
+        def bad_line(current_sample, dataset, reason, annotator):
+            send_report(current_sample, dataset, reason, annotator)
+            next_line = next(line_generators[dataset])
+            return [
+                "<pre>" + next_line["text"] + "</pre>",
+                gr.update(
+                    value="",
+                    placeholder="Provide the reason for flagging if you think the sample is bad.",
+                ),
+                next_line,
+            ]
+        good_btn.click(
+            next_line,
+            inputs=dataset,
+            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
+        )
+        dataset.change(
+            next_line,
+            inputs=dataset,
+            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
+        )
+        bad_btn.click(
+            bad_line,
+            inputs=[current_sample_state, dataset, reason_txt, annotator],
+            outputs=[text, reason_txt, current_sample_state],
+        )
+    demo.launch(enable_queue=False, debug=True)

data/anton_cc_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d51628275c8d69b4093333f25124d1739530610d5afab21aa7ce65ae884d101a
+size 28676983

data/bigcode_python_code_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e681c3ae57ba5342a5df6fb426d6f75b0db857d6dfba249c2d8f7a0f1c358888
+size 9894655

data/bigcode_python_github_issues_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1a08c977cfe4a19cc0dbde6fdd76e17b44966d7e7a0fd09725c4ca8d4ee2cb8
+size 17823834

data/bigcode_python_jupyter_markdowned_clean_dedup_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6cc8ed48cd9f3513113608210b154aa44bc19007bfe6a3dc7450f0710db61e2d
+size 10827004

data/bigcode_python_jupyter_scripts_dedup_filtered_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:930708072ae21b8e3234423a0d8a738f31a55e4af01477aefabb86e6b928b17e
+size 8820911

data/books3_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3096e0c9f10abf1a35f93fb26c0ab5bde41ea20a9b41d677577e58cd5fc1657c
+size 505731876

data/c4_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e62506224d8090b48ec2dc724e8943f6e969ad8dfd6c6ae5b8d33478fc815d13
+size 2657133

data/cc_filtered_text_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:766f2fa24b0d89d6e9a140416fb95b068ab348a4b860bea0ca7ba37f12d8bfc5
+size 6953247

data/gutenberg_raw_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85d3804c498fe5624446e222d85918438e09a3604307a1523be131c3890259b3
+size 172318302

data/helen_cc_bad_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:67a4b70e0eabe8dd270667b74fff3d0e9c426dd02f98b8f6149245a0f682f019
+size 751776

data/helen_cc_good_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f633d4b01f07b6bee8e90ba9285f7c3a6c71b98d92d43dcbfe490880a0ac4fc
+size 28075699

data/reddit_threaded_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60955d5f50d6643af8bf7253e2beb9b1b703a3059968e3d9d2d424954291b64f
+size 2295871

data/s2orc_raw_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fbcb36fd24dca3e62696609327dcb28e2f38e78d9e32c0d24439e66a5c84b191
+size 25281345

data/stackexchange2_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:67e4519399245b056952ec073f00dd6e6e94895e2b1052def39044625537a794
+size 5947625

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ huggingface_hub
2	+ jsonlines