Linker1907
committed on
Commit
•
215f60a
1
Parent(s):
5abe4be
added reddit and cc
Browse files
app.py
CHANGED
@@ -1,52 +1,86 @@
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
import streamlit.components.v1 as components
|
3 |
-
import json
|
4 |
|
5 |
BAD_EXAMPLES_PATH = "bad_examples"
|
6 |
DATA_PATH = "data"
|
7 |
|
|
|
8 |
def load_jsonl(file_path):
|
9 |
data = []
|
10 |
-
with open(file_path,
|
11 |
for line in f:
|
12 |
data.append(json.loads(line))
|
13 |
|
14 |
return data
|
15 |
|
16 |
|
17 |
-
if
|
18 |
st.session_state.idx = 0
|
19 |
|
|
|
20 |
def get_next_item():
|
21 |
st.session_state.idx += 1
|
22 |
|
23 |
-
def save_and_get_next_item(sample):
|
24 |
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
27 |
|
28 |
get_next_item()
|
29 |
|
30 |
|
31 |
-
datasets = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
dataset = st.sidebar.selectbox("Dataset", datasets)
|
33 |
-
data = load_jsonl(f
|
34 |
|
35 |
# create the bad-examples file if it does not exist
|
36 |
-
with open(f
|
37 |
pass
|
38 |
|
39 |
-
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__(
|
40 |
|
41 |
-
with open(f
|
42 |
-
|
|
|
|
|
43 |
|
44 |
-
st.sidebar.button(
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
with st.form(key=
|
47 |
sample = data[st.session_state.idx]
|
48 |
text = sample["text"]
|
49 |
st.text_area(f"text id: {st.session_state.idx}", text, height=500)
|
50 |
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
import streamlit as st
|
4 |
import streamlit.components.v1 as components
|
|
|
5 |
|
6 |
BAD_EXAMPLES_PATH = "bad_examples"
|
7 |
DATA_PATH = "data"
|
8 |
|
9 |
+
|
10 |
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of the file to read; every non-blank line must hold
            one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a stray empty line at the end of the file),
        # which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]
|
17 |
|
18 |
|
19 |
+
# Seed the example index when it is not yet present in session state.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0
|
21 |
|
22 |
+
|
23 |
def get_next_item():
    """Move the session's example index forward by one."""
    st.session_state.idx = st.session_state.idx + 1
|
25 |
|
|
|
26 |
|
27 |
+
def save_and_get_next_item(sample, issue):
    """Append the flagged sample to the bad-examples file, then advance.

    The record is written as one JSON line to
    ``{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl`` for the currently
    selected dataset.

    Args:
        sample: The example record (a JSON-serializable dict) being flagged.
        issue: The reviewer's description of what is wrong with the sample.
    """
    # Build a new record instead of adding the key to the caller's dict, so
    # the loaded example is left unmodified.
    record = {**sample, "issue": issue}

    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(record) + "\n")

    get_next_item()
|
34 |
|
35 |
|
36 |
+
# Names of the datasets offered in the sidebar. Each name is expected to have
# a matching "<DATA_PATH>/<name>_examples_with_stats.json" file.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)

# Despite the ".json" suffix, the file is read line by line as JSON Lines.
data_file = f"{DATA_PATH}/{dataset}_examples_with_stats.json"
data = load_jsonl(data_file)
|
50 |
|
51 |
# Path of the JSONL file that collects flagged examples for the selected
# dataset; computed once instead of repeating the f-string at every use.
bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

# Create the bad-examples file if it does not exist yet; append mode leaves
# any existing content untouched.
with open(bad_examples_file, "a"):
    pass

# Deleting "idx" lets the "idx not in st.session_state" check at the top of
# the script set it back to 0 on the next run.
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__("idx"))

# The download only reads the file, so open it read-only rather than "r+".
with open(bad_examples_file, "r") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )

st.sidebar.button(
    "Clear bad examples file",
    # Opening in "w" mode truncates the file; the handle is closed right away.
    on_click=lambda: open(bad_examples_file, "w").close(),
)
|
68 |
|
69 |
+
# Stop before rendering the form once every example has been shown (or when
# the index is out of range after switching to a smaller dataset), instead of
# failing with an IndexError on the lookup below.
if st.session_state.idx >= len(data):
    st.info("No more examples in this dataset.")
    st.stop()

with st.form(key="bad_form", clear_on_submit=True):
    sample = data[st.session_state.idx]
    text = sample["text"]
    st.text_area(f"text id: {st.session_state.idx}", text, height=500)

    # The key makes the typed text reachable from st.session_state inside the
    # submit callback.
    issue = st.text_input(
        "What's wrong with this example? (leave blank if example is fine)",
        key="issue_input",
    )

    good = st.form_submit_button(
        "GOOD", on_click=get_next_item, use_container_width=True
    )
    bad = st.form_submit_button(
        "BAD",
        # Look the text up when the callback fires: `issue` was bound while
        # rendering, before the reviewer typed, so passing it via `args`
        # would save the stale value.
        on_click=lambda: save_and_get_next_item(
            sample, st.session_state["issue_input"]
        ),
        use_container_width=True,
    )
|
bad_examples/c4_bad_examples.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a1d2500082179deff6c62072e3937f3b432f5615eaea968602f59754eb5cd69d
|
3 |
+
size 3314
|
bad_examples/cc_filtered_text_bad_examples.jsonl
ADDED
File without changes
|
bad_examples/gutenberg_raw_bad_examples.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f116395c3f0c07973218d81c31fb2bf59c44b8b4d8f4e8a97a6228656c3a3d93
|
3 |
+
size 145658
|
data/cc_filtered_text_examples_with_stats.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:766f2fa24b0d89d6e9a140416fb95b068ab348a4b860bea0ca7ba37f12d8bfc5
|
3 |
+
size 6953247
|
data/reddit_threaded_examples_with_stats.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60955d5f50d6643af8bf7253e2beb9b1b703a3059968e3d9d2d424954291b64f
|
3 |
+
size 2295871
|