"""Fetch content from upload.
org ezbee_page.py.
"""
# pylint: disable=invalid-name
# pylint: disable=too-many-locals, too-many-return-statements, too-many-branches, too-many-statements, abstract-class-instantiated
import base64
import platform
import inspect
import io
from functools import partial
from itertools import zip_longest
# import hanzidentifier
import logzero
import numpy as np
import pandas as pd
import pendulum
import streamlit as st
from about_time import about_time
# from ezbee.gen_pairs import gen_pairs # aset2pairs?
from aset2pairs import aset2pairs
from icecream import ic
from loguru import logger as loggu
from logzero import logger
from seg_text import seg_text
from set_loglevel import set_loglevel
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
# from st_aggrid.grid_options_builder import GridOptionsBuilder
from streamlit import session_state as state
from st_mlbee.color_map import color_map
from st_mlbee.fetch_paste import fetch_paste
from st_mlbee.fetch_upload import fetch_upload
from st_mlbee.fetch_urls import fetch_urls
# from st_mlbee.t2s import t2s
from st_mlbee import st_mlbee
def home():  # noqa
    """Run tasks.

    beetype
    sourcetype
    fetch_upload/fetch_paste/fetch_urls
    sourcecount
    align: para-align/sent-align
    save xlsx/tsv
    """
if state.ns.sourcetype not in ["upload", "paste", "urls"]:
st.write("Coming soooooooon...")
return None
# if state.ns.beetype not in ["ezbee", "dzbee", "debee"]:
if state.ns.beetype not in ["mlbee", ]:
st.write("Coming soon...")
return None
# process sourcetype and fetch list1/list2
    list1, list2 = [], []  # separate lists (chained assignment would alias them)
# fetch_upload/fetch_paste
if state.ns.sourcetype in ["upload"]:
fetch_upload()
elif state.ns.sourcetype in ["paste"]:
fetch_paste()
elif state.ns.sourcetype in ["urls"]:
fetch_urls()
else:
st.warning(f"{state.ns.sourcetype}: Not implemented")
return None
    # state.ns.list1 and state.ns.list2 are defined in fetch_*
if state.ns.sentali: # split to sents
try:
state.ns.list1 = seg_text(state.ns.list1)
except Exception as exc:
logger.exception(exc)
raise
try:
state.ns.list2 = seg_text(state.ns.list2)
except Exception as exc:
logger.exception(exc)
raise
logger.debug("state.ns.updated: %s", state.ns.updated)
    # if not updated, quit -- this does not quite work:
    # it only prevents the first run / a missing upload
if not state.ns.updated:
logger.debug(" not updated, early exit.")
return None
list1 = state.ns.list1[:]
list2 = state.ns.list2[:]
logger.debug("list1[:3]: %s", list1[:3])
logger.debug("list2[:3]: %s", list2[:3])
df = pd.DataFrame(zip_longest(list1, list2, fillvalue=""))
try:
# df.columns = ["text1", "text2"]
df.columns = [f"text{i + 1}" for i in range(len(df.columns))]
except Exception as exc:
logger.debug("df: \n%s", df)
logger.error("%s", exc)
state.ns.df = df
logger.debug("df: %s", df)
# st.table(df) # looks alright
# equiv to st.markdown(df.to_markdown())?
    # styled pd dataframe?
# bigger, no pagination
# st.markdown(df.to_html(), unsafe_allow_html=True)
# ag_grid smallish, editable, probably slower
# if "df" not in globals() or "df" not in locals():
if "df" not in locals():
logger.debug(" df not defined, return")
if df.empty:
logger.debug(" df.empty, return")
return None
# print estimated completion time
len1 = len([elm.strip() for elm in list1 if elm.strip()])
len2 = len([elm.strip() for elm in list2 if elm.strip()])
len12 = len1 + len2
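    # per-block processing time in seconds (min/max/average), presumably empirical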
time_min = 0.4
time_max = 1
    time_av = 0.66
uname = platform.uname()
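    # presumably faster hosts (amz2 kernel or the forindo node): scale estimates down 12x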
if "amz2" in uname.release or "forindo" in uname.node:
time_min /= 12
time_max /= 12
time_av /= 12
# reduce for sent align
if state.ns.sentali:
time_min /= 1.4
time_max /= 1.4
time_av /= 1.4
# time0 = len12 * 0.4
# time1 = len12 * 1
# eta = pendulum.now() + pendulum.duration(seconds=len12 * 0.66)
time0 = len12 * time_min
time1 = len12 * time_max
eta = pendulum.now() + pendulum.duration(seconds=len12 * time_av)
in_words0 = pendulum.duration(seconds=time0).in_words()
in_words1 = pendulum.duration(seconds=time1).in_words()
diff_for_humans = eta.diff_for_humans()
dt_str = eta.to_datetime_string()
timezone_name = eta.timezone_name
    eta_msg = (
        f"running in {uname.node} -- "
        f"processing {len1} + {len2} = {len12} blocks; "
        f"estimated time to complete: {in_words0} to {in_words1}; "
        f"eta: {diff_for_humans} ({dt_str} {timezone_name})"
    )
# st.info(_)
# only show this for upload
if state.ns.sourcetype in ["upload"]:
_ = st.expander("to be aligned", expanded=False)
with _:
st.write(df)
logger.info("Processing data... %s", state.ns.beetype)
# if state.ns.beetype in ["ezbee", "dzbee", "debee"]:
if state.ns.beetype in ["mlbee"]:
with about_time() as t:
# diggin...
with st.spinner(f"{eta_msg}"):
try:
# aset = globals()[state.ns.beetype](
aset = st_mlbee(
list1,
list2,
# eps=eps,
# min_samples=min_samples,
)
except Exception as e:
                    logger.exception("st_mlbee(...) exc: %s", e)
aset = ""
st.write("Collecting inputs...")
logger.debug("Collecting inputs...")
return None
st.success(f"Done, took {t.duration_human}")
else:
try:
filename = inspect.currentframe().f_code.co_filename # type: ignore
except Exception as e:
logger.error(e)
filename = ""
try:
lineno = inspect.currentframe().f_lineno # type: ignore
except Exception as e:
logger.error(e)
lineno = ""
st.write(f"{state.ns.beetype} coming soon...{filename}:{lineno}")
return None
if aset:
logger.debug("aset: %s...%s", aset[:3], aset[-3:])
# logger.debug("aset[:10]: %s", aset[:10])
if set_loglevel() <= 10:
st.write(aset)
# aligned_pairs = gen_pairs(list1, list2, aset)
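    # aset presumably holds aligned index/likelihood triples; aset2pairs maps
    # them back onto the original texts as (text1, text2, llh) rows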
aligned_pairs = aset2pairs(list1, list2, aset)
if aligned_pairs:
        logger.debug("%s...%s", aligned_pairs[:1], aligned_pairs[-1:])
df_a = pd.DataFrame(
aligned_pairs, columns=["text1", "text2", "llh"], dtype="object"
)
if set_loglevel() <= 10:
_ = st.expander("done aligned")
with _:
st.table(df_a.astype(str))
# st.markdown(df_a.astype(str).to_markdown())
# st.markdown(df_a.astype(str).to_numpy().tolist())
# insert seq no
df_a.insert(0, "sn", range(len(df_a)))
gb = GridOptionsBuilder.from_dataframe(df_a)
gb.configure_pagination(paginationAutoPageSize=True)
options = {
"resizable": True,
"autoHeight": True,
"wrapText": True,
"editable": True,
}
gb.configure_default_column(**options)
gridOptions = gb.build()
# st.write("editable aligned (double-click a cell to edit, drag column header to adjust widths)")
_ = "editable aligned (double-click a cell to edit, drag column header to adjust widths)"
with st.expander(_, expanded=False):
ag_df = AgGrid(
# df,
df_a,
gridOptions=gridOptions,
key="outside",
reload_data=True,
editable=True,
# width="100%", # width parameter is deprecated
height=750,
# fit_columns_on_grid_load=True,
update_mode=GridUpdateMode.MODEL_CHANGED,
)
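            # ag_df presumably carries the (possibly user-edited) grid data,
            # e.g. ag_df["data"]; edits are not written back to df_a here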
# pop("sn"): remove sn column
df_a.pop("sn")
# ### prep download ### #
# taken from vizbee cb_save_xlsx
        subset = list(df_a.columns[2:])  # 3rd col (llh)
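        # color_map presumably shades each llh cell by its likelihood value
        # (note: Styler.applymap is deprecated in pandas>=2.1 in favor of Styler.map)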
s_df = df_a.astype(str).style.applymap(color_map, subset=subset)
if set_loglevel() <= 10:
logger.debug(" showing styled aligned")
with st.expander("styled aligned"):
# st.dataframe(s_df) # can't handle styleddf
st.table(s_df)
output = io.BytesIO()
with pd.ExcelWriter(
output, engine="xlsxwriter"
) as writer: # pylint: disable=abstract-class-instantiated
s_df.to_excel(writer, index=False, header=False, sheet_name="Sheet1")
writer.sheets["Sheet1"].set_column("A:A", 70)
writer.sheets["Sheet1"].set_column("B:B", 70)
output.seek(0)
val = output.getvalue()
b64 = base64.b64encode(val)
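        # the base64 payload is embedded in the data: URI of the download link below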
filename = ""
if state.ns.src_filename:
filename = f"{state.ns.src_filename}-"
if state.ns.sentali:
extra = "aligned_sents"
else:
extra = "aligned_paras"
        dl_xlsx = (
            f'<a download="{filename}{extra}.xlsx" '
            f'href="data:application/octet-stream;base64,{b64.decode()}">'
            "Download aligned paras xlsx</a>"
        )
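        # tsv download is disabled: assigning the block below to _ as a
        # triple-quoted string makes it a no-op; comment out the opening
        # line to re-enable (the closing line is already a comment)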
_ = """
output = io.BytesIO()
# df_a.astype(str).to_csv(output, sep="\t", index=False, header=False, encoding="gbk")
df_a.astype(object).to_csv(output, sep="\t", index=False, header=False, encoding="gbk")
output.seek(0)
val = output.getvalue()
b64 = base64.b64encode(val)
        dl_tsv = (
            f'<a download="{filename}{extra}.tsv" '
            f'href="data:application/octet-stream;base64,{b64.decode()}">'
            "Download aligned paras tsv</a>"
        )
# """
col1_dl, col2_dl = st.columns(2)
with col1_dl:
st.markdown(dl_xlsx, unsafe_allow_html=True)
_ = """
with col2_dl:
st.markdown(dl_tsv, unsafe_allow_html=True)
# """
# reset
state.ns.updated = False
return None
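

# Minimal usage sketch (assumption: an app entry point seeds state.ns before
# calling home(); the SimpleNamespace defaults below are illustrative, but the
# field names are the ones this module actually reads):
#
#     from types import SimpleNamespace
#     from streamlit import session_state as state
#
#     if "ns" not in state:
#         state.ns = SimpleNamespace(
#             beetype="mlbee", sourcetype="upload", sentali=False,
#             updated=False, list1=[], list2=[], src_filename="",
#         )
#     home()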