Linker1907 committed on
Commit
215f60a
1 Parent(s): 5abe4be

added reddit and cc

Browse files
app.py CHANGED
@@ -1,52 +1,86 @@
 
 
1
  import streamlit as st
2
  import streamlit.components.v1 as components
3
- import json
4
 
5
  BAD_EXAMPLES_PATH = "bad_examples"
6
  DATA_PATH = "data"
7
 
 
8
  def load_jsonl(file_path):
9
  data = []
10
- with open(file_path, 'r') as f:
11
  for line in f:
12
  data.append(json.loads(line))
13
 
14
  return data
15
 
16
 
17
- if 'idx' not in st.session_state:
18
  st.session_state.idx = 0
19
 
 
20
  def get_next_item():
21
  st.session_state.idx += 1
22
 
23
- def save_and_get_next_item(sample):
24
 
25
- with open(f'{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl', 'a') as f:
26
- f.write(json.dumps(sample) + '\n')
 
 
 
27
 
28
  get_next_item()
29
 
30
 
31
- datasets = ['gutenberg_raw', "stackexchange2", "bigcode_python_code", "bigcode_python_github_issues", "bigcode_python_jupyter_scripts_dedup_filtered", "books3", "c4", "s2orc_raw"]
 
 
 
 
 
 
 
 
 
 
 
32
  dataset = st.sidebar.selectbox("Dataset", datasets)
33
- data = load_jsonl(f'{DATA_PATH}/{dataset}_examples_with_stats.json')
34
 
35
  # create bad file if it does not exist
36
- with open(f'{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl', 'a') as f:
37
  pass
38
 
39
- st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__('idx'))
40
 
41
- with open(f'{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl', "r+") as f:
42
- st.sidebar.download_button('Download bad example JSON file', f)
 
 
43
 
44
- st.sidebar.button("Clear bad examples file", on_click=lambda: open(f'{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl', 'w').close())
 
 
 
 
 
45
 
46
- with st.form(key='checkbox', clear_on_submit=True):
47
  sample = data[st.session_state.idx]
48
  text = sample["text"]
49
  st.text_area(f"text id: {st.session_state.idx}", text, height=500)
50
 
51
- good = st.form_submit_button('GOOD', on_click=get_next_item)
52
- bad = st.form_submit_button('BAD', on_click=save_and_get_next_item, args=(sample,))
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
  import streamlit as st
4
  import streamlit.components.v1 as components
 
5
 
6
  BAD_EXAMPLES_PATH = "bad_examples"
7
  DATA_PATH = "data"
8
 
9
+
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]
17
 
18
 
# Position of the sample currently under review; initialised once and then
# kept in session state.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0


def get_next_item():
    """Advance the review cursor to the following sample."""
    st.session_state["idx"] = st.session_state["idx"] + 1
25
 
 
26
 
def save_and_get_next_item(sample, issue):
    """Record a sample as bad, then move on to the next one.

    Appends the sample, annotated with the reviewer's note under the
    ``"issue"`` key, as one JSON line to the bad-examples file of the
    currently selected dataset.

    Args:
        sample: The record under review (a JSON-serialisable dict).
        issue: Free-text description of what is wrong with the sample.
    """
    # Annotate a shallow copy so the record passed in by the caller is not
    # modified as a side effect of saving it.
    record = {**sample, "issue": issue}

    # `dataset` is the module-level sidebar selection.
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(record) + "\n")

    get_next_item()
34
 
35
 
# Names of the datasets that can be reviewed; each one maps to a file under
# DATA_PATH and to a bad-examples file under BAD_EXAMPLES_PATH.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

# Path of the bad-examples file for the selected dataset.
_bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

# Create the bad-examples file if it does not exist yet (append mode leaves
# existing content untouched).
with open(_bad_examples_file, "a") as f:
    pass


def _reset_index():
    """Forget the current position; it is re-initialised when missing."""
    del st.session_state["idx"]


def _clear_bad_examples():
    """Truncate the bad-examples file of the selected dataset."""
    open(_bad_examples_file, "w").close()


st.sidebar.button("Reset Index", on_click=_reset_index)

with open(_bad_examples_file, "r+") as f:
    st.sidebar.download_button(
        "Download bad example JSON file",
        f,
        file_name=f"{dataset}_bad_examples.jsonl",
    )

st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)
68
 
# Guard against running past the end of the data: this happens after the
# last sample has been judged, or after switching to a shorter dataset while
# keeping the old index.
if st.session_state.idx >= len(data):
    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
    st.stop()


def _save_current_as_bad(sample):
    """Submit callback: store `sample` together with the note just typed."""
    # Callback args are bound while the script runs, i.e. before the reviewer
    # types anything, so passing the text_input's return value would save a
    # stale note. Read the submitted value from session state instead.
    save_and_get_next_item(sample, st.session_state.get("issue_text", ""))


with st.form(key="bad_form", clear_on_submit=True):
    sample = data[st.session_state.idx]
    text = sample["text"]
    st.text_area(f"text id: {st.session_state.idx}", text, height=500)

    # The key exposes the submitted text to the callback via session state.
    issue = st.text_input(
        "What's wrong with this example? (leave blank if example is fine)",
        key="issue_text",
    )

    good = st.form_submit_button(
        "GOOD", on_click=get_next_item, use_container_width=True
    )
    bad = st.form_submit_button(
        "BAD",
        on_click=_save_current_as_bad,
        args=(sample,),
        use_container_width=True,
    )
bad_examples/c4_bad_examples.jsonl CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1d2500082179deff6c62072e3937f3b432f5615eaea968602f59754eb5cd69d
3
+ size 3314
bad_examples/cc_filtered_text_bad_examples.jsonl ADDED
File without changes
bad_examples/gutenberg_raw_bad_examples.jsonl CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f116395c3f0c07973218d81c31fb2bf59c44b8b4d8f4e8a97a6228656c3a3d93
3
+ size 145658
data/cc_filtered_text_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:766f2fa24b0d89d6e9a140416fb95b068ab348a4b860bea0ca7ba37f12d8bfc5
3
+ size 6953247
data/reddit_threaded_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60955d5f50d6643af8bf7253e2beb9b1b703a3059968e3d9d2d424954291b64f
3
+ size 2295871