Spaces:
Runtime error
Runtime error
update app
Browse files
- app.py +35 -21
- requirements.txt +1 -1
app.py
CHANGED
@@ -5,9 +5,11 @@ from collections import OrderedDict, defaultdict
|
|
5 |
import diff_viewer
|
6 |
import pandas as pd
|
7 |
import streamlit as st
|
8 |
-
from datasets import
|
|
|
|
|
|
|
9 |
|
10 |
-
DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = os.getenv("DATASET_DIR_PATH_BEFORE_CLEAN_SELECT")
|
11 |
OPERATION_TYPES = [
|
12 |
"Applied filter",
|
13 |
"Applied deduplication function",
|
@@ -16,8 +18,8 @@ OPERATION_TYPES = [
|
|
16 |
MAX_LEN_DS_CHECKS = os.getenv("MAX_LEN_DS_CHECKS")
|
17 |
|
18 |
|
19 |
-
def get_ds(
|
20 |
-
ds =
|
21 |
return ds
|
22 |
|
23 |
|
@@ -41,11 +43,11 @@ def on_click_previous():
|
|
41 |
st.session_state["idx_2"] = previous_idx(st.session_state["idx_2"])
|
42 |
|
43 |
|
44 |
-
def on_ds_change(
|
45 |
-
st.session_state["ds"] = get_ds(
|
46 |
st.session_state["idx_1"] = 0
|
47 |
st.session_state["idx_2"] = 1 if len(st.session_state["ds"]) > 1 else 0
|
48 |
-
st.session_state["
|
49 |
st.session_state["ds_max_docs"] = len(st.session_state["ds"])
|
50 |
|
51 |
|
@@ -128,10 +130,7 @@ def get_log_stats_df(raw_log):
|
|
128 |
return df
|
129 |
|
130 |
|
131 |
-
def get_logs_stats(
|
132 |
-
with open(log_path) as f:
|
133 |
-
raw_log = f.read()
|
134 |
-
|
135 |
try:
|
136 |
df = get_log_stats_df(raw_log)
|
137 |
st.dataframe(df)
|
@@ -263,26 +262,41 @@ st.write(
|
|
263 |
)
|
264 |
col_option_clean, col_option_ds = st.columns(2)
|
265 |
|
266 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
option_clean = col_option_clean.selectbox(
|
268 |
"Select the cleaning version", CLEANING_VERSIONS
|
269 |
)
|
270 |
|
271 |
-
DATASET_DIR_PATH = os.path.join(DATASET_DIR_PATH_BEFORE_CLEAN_SELECT, option_clean)
|
272 |
-
dataset_names = sorted(list(os.listdir(DATASET_DIR_PATH)))
|
273 |
option_ds = col_option_ds.selectbox("Select the dataset", dataset_names)
|
274 |
|
275 |
-
checks_path = os.path.join(DATASET_DIR_PATH, option_ds, "checks")
|
276 |
-
checks_names = sorted(list(os.listdir(checks_path)))
|
277 |
|
278 |
-
log_path = os.path.join(DATASET_DIR_PATH, option_ds, "logs.txt")
|
279 |
-
|
|
|
|
|
280 |
|
281 |
option_check = st.selectbox("Select the operation applied to inspect", checks_names)
|
282 |
-
ds_path = os.path.join(checks_path, option_check)
|
283 |
|
284 |
-
|
285 |
-
|
|
|
|
|
286 |
|
287 |
if len(st.session_state["ds"]) == MAX_LEN_DS_CHECKS:
|
288 |
st.warning(
|
|
|
5 |
import diff_viewer
|
6 |
import pandas as pd
|
7 |
import streamlit as st
|
8 |
+
from datasets import load_dataset, get_dataset_config_names
|
9 |
+
|
10 |
+
CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = os.getenv("CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT")
|
11 |
+
LOGS_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = os.getenv("LOGS_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT")
|
12 |
|
|
|
13 |
OPERATION_TYPES = [
|
14 |
"Applied filter",
|
15 |
"Applied deduplication function",
|
|
|
18 |
MAX_LEN_DS_CHECKS = os.getenv("MAX_LEN_DS_CHECKS")
|
19 |
|
20 |
|
21 |
+
def get_ds(config):
|
22 |
+
ds = load_dataset(CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT, config)
|
23 |
return ds
|
24 |
|
25 |
|
|
|
43 |
st.session_state["idx_2"] = previous_idx(st.session_state["idx_2"])
|
44 |
|
45 |
|
46 |
+
def on_ds_change(config):
|
47 |
+
st.session_state["ds"] = get_ds(config)
|
48 |
st.session_state["idx_1"] = 0
|
49 |
st.session_state["idx_2"] = 1 if len(st.session_state["ds"]) > 1 else 0
|
50 |
+
st.session_state["ds_check_config"] = config
|
51 |
st.session_state["ds_max_docs"] = len(st.session_state["ds"])
|
52 |
|
53 |
|
|
|
130 |
return df
|
131 |
|
132 |
|
133 |
+
def get_logs_stats(raw_log):
|
|
|
|
|
|
|
134 |
try:
|
135 |
df = get_log_stats_df(raw_log)
|
136 |
st.dataframe(df)
|
|
|
262 |
)
|
263 |
col_option_clean, col_option_ds = st.columns(2)
|
264 |
|
265 |
+
CHECK_CONFIGS = get_dataset_config_names(CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT)
|
266 |
+
|
267 |
+
CLEANING_VERSIONS = set()
|
268 |
+
dataset_names = set()
|
269 |
+
checks_names = set()
|
270 |
+
for check_config in CHECK_CONFIGS:
|
271 |
+
cleaning_version, check_config = check_config.split("_dsname_")
|
272 |
+
dataset_name, checks_name = check_config.split("_operation_")
|
273 |
+
CLEANING_VERSIONS.add(cleaning_version)
|
274 |
+
dataset_names.add(dataset_name)
|
275 |
+
checks_names.add(checks_name)
|
276 |
+
|
277 |
+
# CLEANING_VERSIONS = sorted(list(os.listdir(DATASET_DIR_PATH_BEFORE_CLEAN_SELECT)), reverse=True)
|
278 |
option_clean = col_option_clean.selectbox(
|
279 |
"Select the cleaning version", CLEANING_VERSIONS
|
280 |
)
|
281 |
|
282 |
+
# DATASET_DIR_PATH = os.path.join(DATASET_DIR_PATH_BEFORE_CLEAN_SELECT, option_clean)
|
283 |
+
# dataset_names = sorted(list(os.listdir(DATASET_DIR_PATH)))
|
284 |
option_ds = col_option_ds.selectbox("Select the dataset", dataset_names)
|
285 |
|
286 |
+
# checks_path = os.path.join(DATASET_DIR_PATH, option_ds, "checks")
|
287 |
+
# checks_names = sorted(list(os.listdir(checks_path)))
|
288 |
|
289 |
+
# log_path = os.path.join(DATASET_DIR_PATH, option_ds, "logs.txt")
|
290 |
+
ds_log = load_dataset(LOGS_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT, f"{option_clean}_dsname_{option_ds}")
|
291 |
+
log = ds_log["train"][0]
|
292 |
+
get_logs_stats(raw_log=log)
|
293 |
|
294 |
option_check = st.selectbox("Select the operation applied to inspect", checks_names)
|
|
|
295 |
|
296 |
+
ds_check_config = f"{option_clean}_dsname_{option_ds}_operation_{option_check}"
|
297 |
+
|
298 |
+
if "ds" not in st.session_state or ds_check_config != st.session_state["ds_check_config"]:
|
299 |
+
on_ds_change(ds_check_config)
|
300 |
|
301 |
if len(st.session_state["ds"]) == MAX_LEN_DS_CHECKS:
|
302 |
st.warning(
|
requirements.txt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
-
datasets==
|
2 |
pandas==1.3.5
|
3 |
streamlit_diff_viewer==0.0.2
|
|
|
1 |
+
datasets==2.3.2
|
2 |
pandas==1.3.5
|
3 |
streamlit_diff_viewer==0.0.2
|