Add flagging to Google Form
Browse files
- app.py +76 -3
- bad_examples/bigcode_python_code_bad_examples.jsonl +3 -0
- bad_examples/bigcode_python_github_issues_bad_examples.jsonl +3 -0
- bad_examples/bigcode_python_jupyter_scripts_dedup_filtered_bad_examples.jsonl +3 -0
- bad_examples/books3_bad_examples.jsonl +3 -0
- bad_examples/c4_bad_examples.jsonl +2 -2
- bad_examples/gutenberg_raw_bad_examples.jsonl +2 -2
- bad_examples/reddit_threaded_bad_examples.jsonl +3 -0
- bad_examples/s2orc_raw_bad_examples.jsonl +3 -0
- bad_examples/stackexchange2_bad_examples.jsonl +2 -2
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,10 +1,61 @@
|
|
| 1 |
import json
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import streamlit as st
|
| 4 |
import streamlit.components.v1 as components
|
|
|
|
| 5 |
|
| 6 |
BAD_EXAMPLES_PATH = "bad_examples"
|
| 7 |
DATA_PATH = "data"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def load_jsonl(file_path):
|
|
@@ -24,12 +75,33 @@ def get_next_item():
|
|
| 24 |
st.session_state.idx += 1
|
| 25 |
|
| 26 |
|
| 27 |
-
def
|
| 28 |
sample["issue"] = issue
|
| 29 |
|
| 30 |
with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
|
| 31 |
f.write(json.dumps(sample) + "\n")
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
get_next_item()
|
| 34 |
|
| 35 |
|
|
@@ -76,10 +148,11 @@ with st.form(key="bad_form", clear_on_submit=True):
|
|
| 76 |
)
|
| 77 |
|
| 78 |
good = st.form_submit_button(
|
| 79 |
-
"GOOD",
|
|
|
|
| 80 |
)
|
| 81 |
bad = st.form_submit_button(
|
| 82 |
"BAD",
|
| 83 |
-
on_click=
|
| 84 |
args=(sample, issue),
|
| 85 |
)
|
|
|
|
| 1 |
import json
|
| 2 |
+
import math
|
| 3 |
+
from functools import partial
|
| 4 |
|
| 5 |
import streamlit as st
|
| 6 |
import streamlit.components.v1 as components
|
| 7 |
+
from gforms import Form
|
| 8 |
|
| 9 |
BAD_EXAMPLES_PATH = "bad_examples"
|
| 10 |
DATA_PATH = "data"
|
| 11 |
+
MAX_DOC_LENGTH = 30000
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def form_callback(
    element,
    page_index,
    element_index,
    dataset,
    docid,
    text,
    metadata,
    reason,
    person,
    part,
):
    """Return the answer that belongs to a single form element.

    The element is matched by its ``name`` attribute against the known
    question titles; the corresponding argument is returned unchanged.

    Args:
        element: Object exposing a ``name`` attribute (the question title).
        page_index: Accepted but not used by this function.
        element_index: Accepted but not used by this function.
        dataset: Answer for the "Dataset" question.
        docid: Answer for the "Datapoint ID" question.
        text: Answer for the "Text" question.
        metadata: Answer for the "Metadata" question.
        reason: Answer for the "Flagging Reason" question.
        person: Answer for the "Flagging Person" question.
        part: Answer for the "Part" question.

    Returns:
        The matching answer, or ``None`` when the element's name is not one
        of the known question titles.
    """
    # NOTE(review): the (element, page_index, element_index) prefix is
    # presumably the callback signature expected by gforms' Form.fill --
    # confirm against the gforms documentation.
    answers_by_title = (
        ("Dataset", dataset),
        ("Datapoint ID", docid),
        ("Text", text),
        ("Metadata", metadata),
        ("Flagging Reason", reason),
        ("Flagging Person", person),
        ("Part", part),
    )
    for title, answer in answers_by_title:
        if element.name == title:
            return answer
    return None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def report_result(dataset, docid, text, metadata, reason, person, part):
    """Submit one flagged datapoint (or one part of it) to the Google Form.

    A fresh ``Form`` is loaded from the hard-coded form URL, filled by
    handing every element to ``form_callback`` with the given values bound
    as keyword arguments, and then submitted.

    Args:
        dataset: Value for the "Dataset" question.
        docid: Value for the "Datapoint ID" question.
        text: Value for the "Text" question.
        metadata: Value for the "Metadata" question.
        reason: Value for the "Flagging Reason" question.
        person: Value for the "Flagging Person" question.
        part: Value for the "Part" question.
    """
    FORM_URL = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"

    # Collect the answers once, then bind them onto the per-element callback.
    bound_answers = {
        "dataset": dataset,
        "docid": docid,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "person": person,
        "part": part,
    }
    fill_element = partial(form_callback, **bound_answers)

    form = Form()
    form.load(FORM_URL)
    form.fill(fill_element)
    form.submit()
|
| 59 |
|
| 60 |
|
| 61 |
def load_jsonl(file_path):
|
|
|
|
| 75 |
st.session_state.idx += 1
|
| 76 |
|
| 77 |
|
| 78 |
+
def save_flag_and_get_next_item(sample, issue):
    """Record a flagged sample locally and in the Google Form, then advance.

    The sample, extended with the given issue, is appended as one JSON line
    to the per-dataset bad-examples file.  The same datapoint is then
    reported through ``report_result``: the text is sent in slices of at
    most ``MAX_DOC_LENGTH`` characters, one submission per slice, with the
    zero-based slice number as the "part".  Finally ``get_next_item`` is
    called.

    Unlike the previous implementation, ``sample`` is left untouched: the
    "issue" key is not written into it and "text" is not popped from it, so
    a dict that the caller keeps around is not corrupted.

    Args:
        sample: Mapping describing the datapoint; must contain "text" and
            may contain "id" and/or "title".
        issue: The flagging reason selected by the user.

    Raises:
        KeyError: If ``sample`` has no "text" entry.
    """
    # NOTE: ``dataset`` is not a parameter; it is read from the enclosing
    # module scope and must be defined there.
    text = sample["text"]

    # Persist the full record (text included) without mutating the caller's
    # dict; key order matches what ``sample["issue"] = issue`` would give.
    flagged = {**sample, "issue": issue}
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(flagged) + "\n")

    # Everything except the text and the issue is reported as metadata.
    metadata = str(
        {key: value for key, value in sample.items() if key not in ("text", "issue")}
    )

    # Prefer the explicit id, fall back to the title, else leave it blank.
    if "id" in sample:
        sample_id = sample["id"]
    else:
        sample_id = sample.get("title", "")

    # At least one submission is always made, even for an empty text.
    num_parts = max(1, math.ceil(len(text) / MAX_DOC_LENGTH))
    for i in range(num_parts):
        start = i * MAX_DOC_LENGTH
        text_portion = text[start : start + MAX_DOC_LENGTH]
        report_result(dataset, sample_id, text_portion, metadata, issue, "", str(i))

    get_next_item()
|
| 106 |
|
| 107 |
|
|
|
|
| 148 |
)
|
| 149 |
|
| 150 |
good = st.form_submit_button(
|
| 151 |
+
"GOOD",
|
| 152 |
+
on_click=get_next_item,
|
| 153 |
)
|
| 154 |
bad = st.form_submit_button(
|
| 155 |
"BAD",
|
| 156 |
+
on_click=save_flag_and_get_next_item,
|
| 157 |
args=(sample, issue),
|
| 158 |
)
|
bad_examples/bigcode_python_code_bad_examples.jsonl
CHANGED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:deca29f2463d96422b301c1ca4af444e1f1dad66764a2835db243fd1a7abc3c3
|
| 3 |
+
size 3250
|
bad_examples/bigcode_python_github_issues_bad_examples.jsonl
CHANGED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3bfc92a7f740e92393f314bac702965dd47a8084bd093d63632865fda5bb11b0
|
| 3 |
+
size 2876
|
bad_examples/bigcode_python_jupyter_scripts_dedup_filtered_bad_examples.jsonl
CHANGED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3184586d973ef844d86995c33d1439dfeef8faca7813a0cdd80f0d22ca9d84fa
|
| 3 |
+
size 7802
|
bad_examples/books3_bad_examples.jsonl
CHANGED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d011167cb7679eb46b595af10c2965efa2e36ce8085f2c6fe8a7c5d3a28e54d0
|
| 3 |
+
size 452432
|
bad_examples/c4_bad_examples.jsonl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2415a6bc59f376c1535f1eb2c6854b9d13a75842675b3d8231e5d81999d865b2
|
| 3 |
+
size 8618
|
bad_examples/gutenberg_raw_bad_examples.jsonl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d4a0bce4796569315e6af2b9f2313204d5cf108b21b69bb01a27a98b56ff643
|
| 3 |
+
size 2394572
|
bad_examples/reddit_threaded_bad_examples.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90cccebb836615224b151fe1576ad3667933d425bc16e0e8f231671e151b0dbb
|
| 3 |
+
size 2971
|
bad_examples/s2orc_raw_bad_examples.jsonl
CHANGED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0dc4d3ee6ca348b2cd56294e65ff268c73905aec89856e4645bfa4aea108d573
|
| 3 |
+
size 15219
|
bad_examples/stackexchange2_bad_examples.jsonl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d56866a48e1cd99a0bc80ab9088bf7f28e7a861d91a02630252f8fad676147b
|
| 3 |
+
size 41965
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
|
|
|
| 1 |
streamlit==1.20.0
|
|
|
|
| 1 |
+
gforms
|
| 2 |
streamlit==1.20.0
|