Commit
·
215f60a
1
Parent(s):
5abe4be
added reddit and cc
Browse files
app.py
CHANGED
|
@@ -1,52 +1,86 @@
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import streamlit.components.v1 as components
|
| 3 |
-
import json
|
| 4 |
|
| 5 |
BAD_EXAMPLES_PATH = "bad_examples"
|
| 6 |
DATA_PATH = "data"
|
| 7 |
|
|
|
|
| 8 |
def load_jsonl(file_path):
|
| 9 |
data = []
|
| 10 |
-
with open(file_path,
|
| 11 |
for line in f:
|
| 12 |
data.append(json.loads(line))
|
| 13 |
|
| 14 |
return data
|
| 15 |
|
| 16 |
|
| 17 |
-
if
|
| 18 |
st.session_state.idx = 0
|
| 19 |
|
|
|
|
| 20 |
def get_next_item():
|
| 21 |
st.session_state.idx += 1
|
| 22 |
|
| 23 |
-
def save_and_get_next_item(sample):
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
get_next_item()
|
| 29 |
|
| 30 |
|
| 31 |
-
datasets = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
dataset = st.sidebar.selectbox("Dataset", datasets)
|
| 33 |
-
data = load_jsonl(f
|
| 34 |
|
| 35 |
# create bad file if it does not exist
|
| 36 |
-
with open(f
|
| 37 |
pass
|
| 38 |
|
| 39 |
-
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__(
|
| 40 |
|
| 41 |
-
with open(f
|
| 42 |
-
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
st.sidebar.button(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
with st.form(key=
|
| 47 |
sample = data[st.session_state.idx]
|
| 48 |
text = sample["text"]
|
| 49 |
st.text_area(f"text id: {st.session_state.idx}", text, height=500)
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
import streamlit as st
|
| 4 |
import streamlit.components.v1 as components
|
|
|
|
| 5 |
|
| 6 |
BAD_EXAMPLES_PATH = "bad_examples"
|
| 7 |
DATA_PATH = "data"
|
| 8 |
|
| 9 |
+
|
| 10 |
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the host
    # locale, and skip blank lines (e.g. a trailing empty line) instead of
    # letting json.loads fail on them.
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
|
| 17 |
|
| 18 |
|
| 19 |
+
# Seed the per-session position in the dataset whenever it is missing
# (first run of a session, or after the key has been deleted).
if not ("idx" in st.session_state):
    st.session_state["idx"] = 0
|
| 21 |
|
| 22 |
+
|
| 23 |
def get_next_item():
    """Advance the session's sample index by one."""
    st.session_state["idx"] = st.session_state["idx"] + 1
|
| 25 |
|
|
|
|
| 26 |
|
| 27 |
+
def save_and_get_next_item(sample, issue):
    """Record *sample* as a bad example, then move on to the next sample.

    ``issue`` is stored on the sample under the ``"issue"`` key (the dict is
    modified in place), the sample is appended as one JSON line to the
    bad-examples file of the currently selected dataset, and the session
    index is advanced.
    """
    sample["issue"] = issue

    target = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"
    with open(target, "a") as out:
        out.write(json.dumps(sample) + "\n")

    get_next_item()
|
| 34 |
|
| 35 |
|
| 36 |
+
# Names of the datasets that can be reviewed. Each name selects the file
# "<name>_examples_with_stats.json" under DATA_PATH.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]

# Let the reviewer pick a dataset in the sidebar, then load its samples.
dataset = st.sidebar.selectbox(label="Dataset", options=datasets)
examples_file = f"{DATA_PATH}/{dataset}_examples_with_stats.json"
data = load_jsonl(examples_file)
|
| 50 |
|
| 51 |
def _bad_examples_path():
    """Return the bad-examples JSONL path for the currently selected dataset."""
    return f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"


def _reset_index():
    """Drop the stored sample index from the session state."""
    del st.session_state["idx"]


def _clear_bad_examples():
    """Truncate the bad-examples file of the currently selected dataset."""
    open(_bad_examples_path(), "w").close()


# Create the bad-examples file if it does not exist yet; append mode creates
# it without truncating existing content.
open(_bad_examples_path(), "a").close()

st.sidebar.button("Reset Index", on_click=_reset_index)

# Offer the collected bad examples for download.
with open(_bad_examples_path(), "r+") as bad_file:
    st.sidebar.download_button(
        label="Download bad example JSON file",
        data=bad_file,
        file_name=f"{dataset}_bad_examples.jsonl",
    )

st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)
|
| 68 |
|
| 69 |
+
# Review form: shows the current sample and lets the reviewer mark it GOOD
# (just advance) or BAD (append it, with a note, to the bad-examples file).
#
# Guard against an index past the last sample (reached by reviewing the final
# item, or by switching to a shorter dataset), which would otherwise raise
# IndexError on the lookup below.
if st.session_state.idx >= len(data):
    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
else:
    with st.form(key="bad_form", clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)

        # The key exposes the submitted text through st.session_state so the
        # BAD callback can read what the reviewer actually typed.
        issue = st.text_input(
            "What's wrong with this example? (leave blank if example is fine)",
            key="issue_text",
        )

        good = st.form_submit_button(
            "GOOD", on_click=get_next_item, use_container_width=True
        )
        bad = st.form_submit_button(
            "BAD",
            # Look the issue text up when the button is clicked. A value
            # handed over through ``args`` is fixed while the form is being
            # rendered, i.e. before the reviewer has typed anything for this
            # sample, so the saved note would be stale or empty.
            on_click=lambda sample=sample: save_and_get_next_item(
                sample, st.session_state["issue_text"]
            ),
            use_container_width=True,
        )
|
bad_examples/c4_bad_examples.jsonl
CHANGED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1d2500082179deff6c62072e3937f3b432f5615eaea968602f59754eb5cd69d
|
| 3 |
+
size 3314
|
bad_examples/cc_filtered_text_bad_examples.jsonl
ADDED
|
File without changes
|
bad_examples/gutenberg_raw_bad_examples.jsonl
CHANGED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f116395c3f0c07973218d81c31fb2bf59c44b8b4d8f4e8a97a6228656c3a3d93
|
| 3 |
+
size 145658
|
data/cc_filtered_text_examples_with_stats.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:766f2fa24b0d89d6e9a140416fb95b068ab348a4b860bea0ca7ba37f12d8bfc5
|
| 3 |
+
size 6953247
|
data/reddit_threaded_examples_with_stats.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60955d5f50d6643af8bf7253e2beb9b1b703a3059968e3d9d2d424954291b64f
|
| 3 |
+
size 2295871
|