omkarenator committed on
Commit
87a6313
1 Parent(s): 5171d34

unify data viewer, DV, DV2, DVS

Browse files
Files changed (4) hide show
  1. curated.py +4 -0
  2. data_viewer.py +157 -0
  3. main.py +5 -2
  4. web.py +2 -156
curated.py CHANGED
@@ -5,6 +5,7 @@ from fh_plotly import plotly2fasthtml
5
  import pandas as pd
6
  import json
7
  from data_viewer import view_data, gen_random_id
 
8
  from rich import print
9
  import uuid
10
  import plotly.express as px
@@ -485,6 +486,9 @@ wiki_examples = Div(
485
  ),
486
  )
487
 
 
 
 
488
  def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
489
  doc_id = max(0, min(int(doc_id), 9))
490
 
 
5
  import pandas as pd
6
  import json
7
  from data_viewer import view_data, gen_random_id
8
+ from data_viewer import DV, DV2, DVS
9
  from rich import print
10
  import uuid
11
  import plotly.express as px
 
486
  ),
487
  )
488
 
489
+ wiki_examples = DV("data/curated_samples/wiki.json", 0, "Wikipedia")
490
+
491
+
492
  def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
493
  doc_id = max(0, min(int(doc_id), 9))
494
 
data_viewer.py CHANGED
@@ -3,6 +3,7 @@ from fasthtml.components import *
3
  import json
4
  import string
5
  import random
 
6
 
7
 
8
  def gen_random_id() -> str:
@@ -79,3 +80,159 @@ def view_data(
79
  style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
80
  )
81
  return Div(form, data_display, style="margin-top: 10px;", id=target)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import json
4
  import string
5
  import random
6
+ import jsonlines
7
 
8
 
9
  def gen_random_id() -> str:
 
80
  style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
81
  )
82
  return Div(form, data_display, style="margin-top: 10px;", id=target)
83
+
84
+
85
+
86
def DVS(
    left,
    header,
):
    """Render one already-loaded sample as a static (slider-less) viewer.

    Args:
        left: JSON-serialisable object to display.
        header: Title rendered in an ``H3`` above the sample.

    Returns:
        A ``Div`` containing the heading followed by a scrollable,
        200px-high box with the pretty-printed JSON.
    """
    pretty_json = json.dumps(left, indent=4, ensure_ascii=False)
    sample_column = Div(
        Pre(pretty_json, style="white-space: pre-wrap; word-break: break-all;"),
        style="float: left; overflow-x: auto;",
    )
    scroll_box = Div(
        sample_column,
        style="overflow: auto; clear: both; height: 200px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(H3(header), scroll_box, style="margin-top: 10px;")
103
+
104
+
105
def DV(
    left_file,
    doc_id,
    header,
    target: str = None,
):
    """Build a single-column data viewer with a slider that picks the sample.

    The records are read from ``left_file`` (JSON Lines when the name ends
    with ``jsonl``, a JSON document otherwise) and the record at ``doc_id``
    is pretty-printed. Moving the slider issues an htmx GET to
    ``/update/{target}`` carrying the file path and header.

    Args:
        left_file: Path of the JSON / JSON Lines file to display.
        doc_id: Index of the record to show; coerced to ``int`` and clamped
            to ``[0, len(records) - 1]``.
        header: Title rendered in an ``H3`` above the slider.
        target: DOM id of the wrapper element; a random 8-letter lowercase
            id is generated when omitted.

    Returns:
        A ``Div`` with ``id=target`` holding the slider form and the
        scrollable sample box.

    Raises:
        IndexError: If the file contains no records.
    """
    if target is None:
        target = "".join(random.choices(string.ascii_lowercase, k=8))

    # Context managers make sure the file handles are closed after loading.
    if left_file.endswith("jsonl"):
        with jsonlines.open(left_file) as reader:
            left = list(reader)
    else:
        with open(left_file, encoding="utf-8") as fh:
            left = json.load(fh)

    max_doc_id = len(left) - 1
    # Clamp so an out-of-range index does not raise and a negative one does
    # not silently wrap around to the end of the list.
    doc_id = max(0, min(int(doc_id), max_doc_id))

    slider = Input(
        type="range",
        name=f"doc_id_{target}",
        min="0",
        max=str(max_doc_id),
        value=str(doc_id),
        hx_get=f"/update/{target}",
        hx_target=f"#{target}",
        hx_trigger="change",
        # The response root carries id=target itself, so replace the whole
        # element; an innerHTML swap would nest two elements with one id.
        hx_swap="outerHTML",
        hx_vals=json.dumps({"left_file": str(left_file), "header": str(header)}),
    )

    form = Div(
        H3(header),
        Label(
            "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
        ),
        cls="plotly_input_container",
        style="padding: 20px;",
    )

    col1 = Div(
        Pre(
            json.dumps(left[doc_id], indent=4, ensure_ascii=False),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style="float: left; overflow-x: auto;",
    )

    data_display = Div(
        col1,
        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(form, data_display, style="margin-top: 10px;", id=target)
154
+
155
+
156
def DV2(
    left_file,
    right_file,
    doc_id,
    target: str = None,
):
    """Build a two-column viewer comparing the same record from two files.

    Both files are read as JSON documents. The record at ``doc_id`` from
    ``left_file`` is shown under "Raw format" and the one from
    ``right_file`` under "Extracted format". Moving the slider issues an
    htmx GET to ``/update/{target}`` carrying both file paths.

    Args:
        left_file: Path of the JSON file shown in the left column.
        right_file: Path of the JSON file shown in the right column.
        doc_id: Index of the record to show; coerced to ``int`` and clamped
            to the range valid for both files.
        target: DOM id of the wrapper element; a random 8-letter lowercase
            id is generated when omitted.

    Returns:
        A ``Div`` with ``id=target`` holding the slider form and the
        scrollable two-column sample box.

    Raises:
        IndexError: If either file contains no records.
    """
    if target is None:
        target = "".join(random.choices(string.ascii_lowercase, k=8))

    # Context managers make sure the file handles are closed after loading.
    with open(left_file, encoding="utf-8") as fh:
        left = json.load(fh)
    with open(right_file, encoding="utf-8") as fh:
        right = json.load(fh)

    # Bound by the shorter file so every slider position exists on both sides.
    max_doc_id = min(len(left), len(right)) - 1
    # Clamp so an out-of-range index does not raise and a negative one does
    # not silently wrap around to the end of the list.
    doc_id = max(0, min(int(doc_id), max_doc_id))

    slider = Input(
        type="range",
        name=f"doc_id_{target}",
        min="0",
        max=str(max_doc_id),
        value=str(doc_id),
        hx_get=f"/update/{target}",
        hx_target=f"#{target}",
        hx_trigger="change",
        # The response root carries id=target itself, so replace the whole
        # element; an innerHTML swap would nest two elements with one id.
        hx_swap="outerHTML",
        hx_vals=json.dumps(
            {"left_file": str(left_file), "right_file": str(right_file)}
        ),
    )

    form = Div(
        Label(
            "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
        ),
        cls="plotly_input_container",
        style="padding: 20px;",
    )

    col1 = Div(
        H3("Raw format", style="margin-top: 0px;"),
        Pre(
            json.dumps(left[doc_id], indent=4, ensure_ascii=False),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style="width: 48%; float: left; overflow-x: auto;",
    )

    col2 = Div(
        H3("Extracted format", style="margin-top: 0px;"),
        Pre(
            json.dumps(right[doc_id], indent=4, ensure_ascii=False),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style="width: 48%; float: right; overflow-x: auto;",
    )

    data_display = Div(
        col1,
        col2,
        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(form, data_display, style="margin-top: 10px;", id=target)
216
+
217
+
218
def update(target: str, request):
    """Re-render a data viewer in response to a slider change.

    Reads ``doc_id_{target}``, ``left_file``, ``right_file`` and ``header``
    from the request's query parameters and dispatches to ``DV2`` when both
    files are given, otherwise to ``DV``.

    Args:
        target: DOM id of the viewer being refreshed (taken from the route).
        request: Incoming request; only ``request.query_params`` is used.

    Returns:
        The rebuilt viewer component (wrapped in a one-element tuple for the
        two-file case), or a small ``Div`` with an error message when
        ``left_file`` is missing.
    """
    default_doc_id = 3
    params = request.query_params

    # A malformed slider value falls back to the default instead of raising.
    try:
        doc_id = int(params.get(f"doc_id_{target}", default_doc_id))
    except (TypeError, ValueError):
        doc_id = default_doc_id

    # SECURITY: both paths come straight from the query string and are opened
    # as-is by DV / DV2. Consider restricting them to known dataset files.
    left_file = params.get("left_file")
    right_file = params.get("right_file")

    if not left_file:
        # Without a file there is nothing to render; keep the element
        # addressable by id so later swaps still find it.
        return Div("Missing 'left_file' query parameter.", id=target)

    if right_file:
        # NOTE(review): the one-element tuple wrapper is kept as-is; confirm
        # whether the framework actually needs it.
        return (
            DV2(
                left_file,
                right_file,
                doc_id,
                target,
            ),
        )

    return DV(
        left_file,
        doc_id,
        params.get("header"),
        target,
    )
main.py CHANGED
@@ -21,6 +21,7 @@ import web
21
  import common
22
  import results
23
  from pybtex.database import parse_file
 
24
 
25
 
26
  app, rt = fast_app(
@@ -541,11 +542,13 @@ def intro():
541
 
542
 
543
  #rt("/overview")(overview.overview)
 
 
 
 
544
  rt("/curated")(curated.curated)
545
- rt("/curated/{target}")(curated.update)
546
 
547
  rt("/webdata")(web.web_data)
548
- rt("/webdata/{target}")(web.update)
549
 
550
  rt("/common")(common.common_steps)
551
 
 
21
  import common
22
  import results
23
  from pybtex.database import parse_file
24
+ import data_viewer
25
 
26
 
27
  app, rt = fast_app(
 
542
 
543
 
544
  #rt("/overview")(overview.overview)
545
+
546
+ rt("/update/{target}")(data_viewer.update)
547
+
548
+
549
  rt("/curated")(curated.curated)
 
550
 
551
  rt("/webdata")(web.web_data)
 
552
 
553
  rt("/common")(common.common_steps)
554
 
web.py CHANGED
@@ -7,9 +7,11 @@ from rich import print
7
  import jsonlines
8
  from data.url_blocklist import urls_high_matches, urls_false_positives
9
  from data.non_web_urls import non_web_urls
 
10
  from fasthtml.components import D_code
11
  import pandas as pd
12
 
 
13
  data_filtering_table_data = pd.DataFrame(
14
  {
15
  "Dataset": [
@@ -176,162 +178,6 @@ table_html_qf_filter_data = qf_filtering_table_data.to_html(index=False, border=
176
  table_div_qf_filter_data = Div(NotStr(table_html_qf_filter_data), style="margin: 40px;")
177
 
178
 
179
- def DVS(
180
- left,
181
- header,
182
- ):
183
- col1 = Div(
184
- Pre(
185
- json.dumps(left, indent=4, ensure_ascii=False),
186
- style="white-space: pre-wrap; word-break: break-all;",
187
- ),
188
- style="float: left; overflow-x: auto;",
189
- )
190
-
191
- data_display = Div(
192
- col1,
193
- style="overflow: auto; clear: both; height: 200px; border: 1px solid #ccc; padding: 20px;",
194
- )
195
- return Div(H3(header), data_display, style="margin-top: 10px;")
196
-
197
-
198
- def DV(
199
- left_file,
200
- doc_id,
201
- header,
202
- target: str = None,
203
- ):
204
- if target is None:
205
- target = "".join(random.choices(string.ascii_lowercase, k=8))
206
-
207
- if left_file.endswith("jsonl"):
208
- left = [x for x in jsonlines.open(left_file)]
209
- else:
210
- left = json.load(open(left_file, encoding="utf-8"))
211
- max_doc_id = len(left) - 1
212
- slider = Input(
213
- type="range",
214
- name=f"doc_id_{target}",
215
- min="0",
216
- max=str(max_doc_id),
217
- value=str(doc_id),
218
- hx_get=f"/webdata/{target}",
219
- hx_target=f"#{target}",
220
- hx_trigger="change",
221
- hx_swap="innerHTML",
222
- hx_vals=json.dumps({"left_file": f"{left_file}", "header": f"{header}"}),
223
- )
224
-
225
- form = Div(
226
- H3(header),
227
- Label(
228
- "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
229
- ),
230
- cls="plotly_input_container",
231
- style="padding: 20px;",
232
- )
233
-
234
- col1 = Div(
235
- Pre(
236
- json.dumps(left[doc_id], indent=4, ensure_ascii=False),
237
- style="white-space: pre-wrap; word-break: break-all;",
238
- ),
239
- style="float: left; overflow-x: auto;",
240
- )
241
-
242
- data_display = Div(
243
- col1,
244
- style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
245
- )
246
- return Div(form, data_display, style="margin-top: 10px;", id=target)
247
-
248
-
249
- def DV2(
250
- left_file,
251
- right_file,
252
- doc_id,
253
- target: str = None,
254
- ):
255
- if target is None:
256
- target = "".join(random.choices(string.ascii_lowercase, k=8))
257
-
258
- left = json.load(open(left_file, encoding="utf-8"))
259
- right = json.load(open(right_file, encoding="utf-8"))
260
- max_doc_id = len(left) - 1
261
-
262
- slider = Input(
263
- type="range",
264
- name=f"doc_id_{target}",
265
- min="0",
266
- max=str(max_doc_id),
267
- value=str(doc_id),
268
- hx_get=f"/webdata/{target}",
269
- hx_target=f"#{target}",
270
- hx_trigger="change",
271
- hx_swap="innerHTML",
272
- hx_vals=json.dumps(
273
- {"left_file": f"{left_file}", "right_file": f"{right_file}"}
274
- ),
275
- )
276
-
277
- form = Div(
278
- Label(
279
- "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
280
- ),
281
- cls="plotly_input_container",
282
- style="padding: 20px;",
283
- )
284
-
285
- col1 = Div(
286
- H3("Raw format", style="margin-top: 0px;"),
287
- Pre(
288
- json.dumps(left[doc_id], indent=4, ensure_ascii=False),
289
- style="white-space: pre-wrap; word-break: break-all;",
290
- ),
291
- style="width: 48%; float: left; overflow-x: auto;",
292
- )
293
-
294
- col2 = Div(
295
- H3("Extracted format", style="margin-top: 0px;"),
296
- Pre(
297
- json.dumps(right[doc_id], indent=4, ensure_ascii=False),
298
- style="white-space: pre-wrap; word-break: break-all;",
299
- ),
300
- style="width: 48%; float: right; overflow-x: auto;",
301
- )
302
-
303
- data_display = Div(
304
- col1,
305
- col2,
306
- style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
307
- )
308
- return Div(form, data_display, style="margin-top: 10px;", id=target)
309
-
310
-
311
- def update(target: str, request):
312
- params = request.query_params
313
- print(params)
314
- doc_id = int(params.get(f"doc_id_{target}", 3))
315
- left_file = params.get("left_file")
316
- right_file = params.get("right_file")
317
- if left_file and right_file:
318
- return (
319
- DV2(
320
- left_file,
321
- right_file,
322
- doc_id,
323
- target,
324
- ),
325
- )
326
- else:
327
- return DV(
328
- left_file,
329
- doc_id,
330
- params.get("header"),
331
- target,
332
- )
333
-
334
-
335
  dolma311 = """
336
  words = text.split()
337
  word_count = len(words)
 
7
  import jsonlines
8
  from data.url_blocklist import urls_high_matches, urls_false_positives
9
  from data.non_web_urls import non_web_urls
10
+ from data_viewer import DV, DV2, DVS
11
  from fasthtml.components import D_code
12
  import pandas as pd
13
 
14
+
15
  data_filtering_table_data = pd.DataFrame(
16
  {
17
  "Dataset": [
 
178
  table_div_qf_filter_data = Div(NotStr(table_html_qf_filter_data), style="margin: 40px;")
179
 
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  dolma311 = """
182
  words = text.split()
183
  word_count = len(words)