Spaces:

mikeee
/

radiobee-dev

Runtime error

App Files Files Community

freemt committed on Jan 20, 2022

Commit

a15cd26

•

1 Parent(s): 5a186f5

Update sent-ali fast and slow

Browse files

Files changed (9) hide show

img/plt.png +0 -0
radiobee/__main__.py +53 -6
radiobee/align_sents.py +0 -72
radiobee/align_sents.pyc +0 -0
radiobee/error_msg.py +2 -2
radiobee/gradiobee.py +68 -5
radiobee/paras2sents.py +110 -0
requirements.txt +1 -0
tests/test_paras2sents.py +8 -2

img/plt.png CHANGED Viewed

radiobee/__main__.py CHANGED Viewed

@@ -139,7 +139,6 @@ if __name__ == "__main__":
         gr.inputs.File(label="file 2", optional=True),
     ]
-    # modi 1
     _ = """
         tf_type: Literal[linear, sqrt, log, binary] = 'linear'
         idf_type: Optional[Literal[standard, smooth, bm25]] = None
@@ -159,10 +158,13 @@ if __name__ == "__main__":
     )  # ditto
     input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None")  # ditto
-    inputs = [
         gr.inputs.File(label="file 1"),
         gr.inputs.File(label="file 2", optional=True),
-        input_tf_type,  # modi inputs
         input_idf_type,
         input_dl_type,
         input_norm_type,
@@ -178,6 +180,7 @@ if __name__ == "__main__":
             step=1,
             default=6,
         ),
     ]
     examples = [
@@ -190,6 +193,29 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/test_en.txt",
@@ -200,6 +226,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/shakespeare_zh500.txt",
@@ -210,6 +237,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/shakespeare_en500.txt",
@@ -220,6 +248,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/hlm-ch1-zh.txt",
@@ -230,6 +259,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/hlm-ch1-en.txt",
@@ -240,6 +270,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/ps-cn.txt",
@@ -250,6 +281,7 @@ if __name__ == "__main__":
             "None",
             10,
             4,
         ],
         [
             "data/test-dual.txt",
@@ -260,6 +292,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/英译中国现代散文选1(汉外对照丛书).txt",
@@ -270,6 +303,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/test-zh-ja.txt",
@@ -280,6 +314,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/xiyouji-ch1-zh.txt",
@@ -290,6 +325,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/demian-hesse-de.txt",
@@ -300,6 +336,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
         [
             "data/catcher-in-the-rye-shixianrong-zh.txt",
@@ -310,6 +347,7 @@ if __name__ == "__main__":
             "None",
             10,
             6,
         ],
     ]
@@ -340,14 +378,23 @@ if __name__ == "__main__":
     out_file_dl_excel = gr.outputs.File(
         label="Click to download xlsx",
     )
-    # modi outputs
-    outputs = [
         out_df,
-        # "plot",
         gr.outputs.Image(label="plot"),
         out_file_dl,
         out_file_dl_excel,
         out_df_aligned,
         gr.outputs.HTML(),
     ]

         gr.inputs.File(label="file 2", optional=True),
     ]
     _ = """
         tf_type: Literal[linear, sqrt, log, binary] = 'linear'
         idf_type: Optional[Literal[standard, smooth, bm25]] = None
     )  # ditto
     input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None")  # ditto
+    # modi inputs 1, definitions
+    sent_ali_algo = gr.inputs.Radio(["None", "fast", "slow"], default="None")
+    inputs = [  # tot. 9, need to modify input of gradio & examples
         gr.inputs.File(label="file 1"),
         gr.inputs.File(label="file 2", optional=True),
+        input_tf_type,  # modi inputs 2
         input_idf_type,
         input_dl_type,
         input_norm_type,
             step=1,
             default=6,
         ),
+        sent_ali_algo,
     ]
     examples = [
             "None",
             10,
             6,
+            "None",
+        ],
+        [
+            "data/test_zh.txt",
+            "data/test_en.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+            "fast",
+        ],
+        [
+            "data/test_zh.txt",
+            "data/test_en.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+            "slow",
         ],
         [
             "data/test_en.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/shakespeare_zh500.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/shakespeare_en500.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/hlm-ch1-zh.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/hlm-ch1-en.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/ps-cn.txt",
             "None",
             10,
             4,
+            "None",
         ],
         [
             "data/test-dual.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/英译中国现代散文选1(汉外对照丛书).txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/test-zh-ja.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/xiyouji-ch1-zh.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/demian-hesse-de.txt",
             "None",
             10,
             6,
+            "None",
         ],
         [
             "data/catcher-in-the-rye-shixianrong-zh.txt",
             "None",
             10,
             6,
+            "None",
         ],
     ]
     out_file_dl_excel = gr.outputs.File(
         label="Click to download xlsx",
     )
+    out_sents_dl = gr.outputs.File(
+        label="Click to download sents csv",
+    )
+    out_sents_dl_excel = gr.outputs.File(
+        label="Click to download sents xlsx",
+    )
+    # modi outputs 1, definitions
+    # modi outputs 2, need to modify gradio error_msg
+    outputs = [  # tot. 8
         out_df,
         gr.outputs.Image(label="plot"),
         out_file_dl,
         out_file_dl_excel,
+        out_sents_dl,
+        out_sents_dl_excel,
         out_df_aligned,
         gr.outputs.HTML(),
     ]

radiobee/align_sents.py DELETED Viewed

@@ -1,72 +0,0 @@
-"""Align sents via gale-church."""
-# pylint: disable=invalid-name
-from typing import List, Tuple  # noqa
-import re
-# from itertools import tee
-# from more_itertools import ilen
-from nltk.translate.gale_church import align_blocks
-from radiobee.amend_avec import amend_avec
-def align_sents(lst1: List[str], lst2: List[str]) -> List[Tuple[str, str]]:
-    """Align sents.
-    >>> lst1, lst2 = ['a', 'bs',], ['aaa', '34', 'a', 'b']
-    """
-    if isinstance(lst1, str):
-        lst1 = [lst1]
-    if isinstance(lst2, str):
-        lst2 = [lst2]
-    src_blocks = [len(re.sub(r"\s+", "", elm)) for elm in lst1]
-    tgt_blocks = [len(re.sub(r"\s+", "", elm)) for elm in lst2]
-    avec = align_blocks(src_blocks, tgt_blocks)
-    len1, len2 = len(lst1), len(lst2)
-    # lst1, _ = tee(lst1)
-    # len1 = ilen(_)
-    # lst2, _ = tee(lst2)
-    # len2 = ilen(_)
-    amended_avec = amend_avec(avec, len1, len2)
-    texts = []
-    # for elm in aset:
-    # for elm0, elm1 in amended_avec:
-    for elm in amended_avec:
-        # elm0, elm1, elm2 = elm
-        elm0, elm1 = elm[:2]
-        _ = []
-        # src_text first
-        if isinstance(elm0, str):
-            _.append("")
-        else:
-            # _.append(src_text[int(elm0)])
-            _.append(lst1[int(elm0)])
-        if isinstance(elm1, str):
-            _.append("")
-        else:
-            # _.append(tgt_text[int(elm0)])
-            _.append(lst2[int(elm1)])
-        _a = """
-        if isinstance(elm2, str):
-            _.append("")
-        else:
-            _.append(round(elm2, 2))
-        # """
-        del _a
-        texts.append(tuple(_))
-    return texts
-    # return ["", ""]

radiobee/align_sents.pyc ADDED Viewed

Binary file (1.55 kB). View file

radiobee/error_msg.py CHANGED Viewed

@@ -8,7 +8,7 @@ import pandas as pd
 def error_msg(
     msg: Optional[Union[str, Exception]],
     title: str = "error message",
-) -> Tuple[Union[pd.DataFrame, None], None, None, None, None, None]:
     """Prepare an error message for gradiobee outputs."""
     if msg is None:
         msg = "none..."
@@ -21,4 +21,4 @@ def error_msg(
     df = pd.DataFrame([msg], columns=[title])
     # return df, *((None,) * 4)  # pyright complains
-    return df, None, None, None, None, None

 def error_msg(
     msg: Optional[Union[str, Exception]],
     title: str = "error message",
+) -> Tuple[Union[pd.DataFrame, None], None, None, None, None, None, None, None]:
     """Prepare an error message for gradiobee outputs."""
     if msg is None:
         msg = "none..."
     df = pd.DataFrame([msg], columns=[title])
     # return df, *((None,) * 4)  # pyright complains
+    return df, None, None, None, None, None, None, None

radiobee/gradiobee.py CHANGED Viewed

@@ -30,6 +30,10 @@ from radiobee.trim_df import trim_df
 from radiobee.error_msg import error_msg
 from radiobee.text2lists import text2lists
 uname = platform.uname()
 HFSPACES = False
 if "amzn2" in uname.release:  # on hf spaces
@@ -43,7 +47,7 @@ debug = False
 debug = True
-def gradiobee(
     file1,
     file2,
     tf_type,
@@ -53,6 +57,7 @@ def gradiobee(
     eps,
     min_samples,
     # debug=False,
 ):
     """Process inputs and return outputs."""
     logger.debug(" *debug* ")
@@ -382,7 +387,7 @@ def gradiobee(
     df_aligned = df_aligned[["text2", "text1", "likelihood"]]
     df_aligned.columns = ["text1", "text2", "likelihood"]
-    ic(df_aligned.head())
     # round the last column to 2
     # df_aligned.likelihood = df_aligned.likelihood.round(2)
@@ -434,8 +439,66 @@ def gradiobee(
     # return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned
     # return df_trimmed, output_plot, file_dl, file_dl_xlsx, styled, df_html  # gradio cant handle style
-    ic("returning outputs")
-    return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned, df_html
-    # modi outputs

 from radiobee.error_msg import error_msg
 from radiobee.text2lists import text2lists
+from radiobee.align_sents import align_sents
+from radiobee.shuffle_sents import shuffle_sents  # type: ignore
+from radiobee.paras2sents import paras2sents  # type: ignore
 uname = platform.uname()
 HFSPACES = False
 if "amzn2" in uname.release:  # on hf spaces
 debug = True
+def gradiobee(  # noqa
     file1,
     file2,
     tf_type,
     eps,
     min_samples,
     # debug=False,
+    sent_ali_algo,
 ):
     """Process inputs and return outputs."""
     logger.debug(" *debug* ")
     df_aligned = df_aligned[["text2", "text1", "likelihood"]]
     df_aligned.columns = ["text1", "text2", "likelihood"]
+    ic("paras aligned: ", df_aligned.head(10))
     # round the last column to 2
     # df_aligned.likelihood = df_aligned.likelihood.round(2)
     # return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned
     # return df_trimmed, output_plot, file_dl, file_dl_xlsx, styled, df_html  # gradio cant handle style
+    ic("sent-ali-algo: ", sent_ali_algo)
+    # ### sent-ali-algo is None: para align
+    if sent_ali_algo in ["None"]:
+        ic("returning para-ali outputs")
+        return df_trimmed, output_plot, file_dl, file_dl_xlsx, None, None, df_aligned, df_html
+    # ### proceed with sent align
+    if sent_ali_algo in ["fast"]:
+        ic(sent_ali_algo)
+        align_func = align_sents
+        ic(df_aligned.shape, df_aligned.columns)
+        aligned_sents = paras2sents(df_aligned, align_func)
+        # ic(pd.DataFrame(aligned_sents).shape, aligned_sents)
+        ic(pd.DataFrame(aligned_sents).shape)
+        df_aligned_sents = pd.DataFrame(aligned_sents, columns=["text1", "text2"])
+    else:  # ["slow"]
+        ic(sent_ali_algo)
+        align_func = shuffle_sents
+        aligned_sents = paras2sents(df_aligned, align_func, lang1, lang2)
+        # add extra entry if necessary
+        aligned_sents = [list(sent) + [""] if len(sent) == 2 else list(sent) for sent in aligned_sents]
+        df_aligned_sents = pd.DataFrame(aligned_sents, columns=["text1", "text2", "likelihood"])
+    # prepare sents downloads
+    file_dl_sents = Path(f"{file_dl.stem}-sents{file_dl.suffix}")
+    file_dl_xlsx_sents = Path(f"{file_dl_xlsx.stem}-sents{file_dl_xlsx.suffix}")
+    _ = df_aligned_sents.to_csv(index=False)
+    file_dl_sents.write_text(_, encoding="utf8")
+    df_aligned_sents.to_excel(file_dl_xlsx_sents)
+    # prepare html output
+    if len(df_aligned_sents) > 200:
+        df_html = None
+    else:  # show a one-batch table in html
+        # style
+        styled = df_aligned_sents.style.set_properties(
+            **{
+                "font-size": "10pt",
+                "border-color": "black",
+                "border": "1px black solid !important"
+            }
+            # border-color="black",
+        ).set_table_styles([{
+            "selector": "",  # noqs
+            "props": [("border", "2px black solid !important")]}]  # noqs
+        ).format(
+            precision=2
+        )
+        df_html = styled.to_html()
+    # aligned sents outputs
+    ic("aligned sents outputs")
+    # return df_trimmed, output_plot, file_dl, file_dl_xlsx, None, None, df_aligned, df_html
+    return df_trimmed, output_plot, file_dl, file_dl_xlsx, file_dl_sents, file_dl_xlsx_sents, df_aligned_sents, df_html

radiobee/paras2sents.py ADDED Viewed

	@@ -0,0 +1,110 @@

+"""Convert paras to sents."""
+# pylint: disable=unused-import, too-many-branches, ungrouped-imports
+from typing import Callable, List, Optional, Tuple, Union
+from itertools import zip_longest
+import numpy as np
+import pandas as pd
+from logzero import logger
+from radiobee.align_sents import align_sents
+from radiobee.seg_text import seg_text
+from radiobee.detect import detect
+try:
+    from radiobee.shuffle_sents import shuffle_sents
+except Exception as exc:
+    logger.error("shuffle_sents not available: %s, using align_sents", exc)
+    shuffle_sents = lambda x1, x2, lang1="", lang2="": align_sents(x1, x2)  # noqa
+def paras2sents(
+    paras_: Union[pd.DataFrame, List[Tuple[str, str, Union[str, float]]], np.ndarray],
+    align_func: Optional[Union[Callable, str]] = None,
+    lang1: Optional[str] = None,
+    lang2: Optional[str] = None,
+) -> List[Tuple[str, str, Union[str, float]]]:
+    """Convert paras to sents using align_func.
+    Args:
+        paras_: list of 3-tuples or numpy or pd.DataFrame
+        lang1: first lang code
+        lang2: second lang code
+        align_func: func used in the sent level
+            if set to None, default to align_sents
+    Returns:
+        list of sents (possibly with likelihood for shuffle_sents)
+    """
+    # wrap everything in pd.DataFrame
+    # necessary to make pyright happy
+    paras = pd.DataFrame(paras_).fillna("")
+    # take the first three columns at maximum
+    paras = paras.iloc[:, :3]
+    if len(paras.columns) < 2:
+        logger.error(
+            "Need at least two columns, got %s",
+            len(paras.columns)
+        )
+        raise Exception("wrong data")
+    # append the third col (all "") if there are only two cols
+    if len(paras.columns) < 3:
+        paras.insert(2, "likelihood", [""] * len(paras))
+    if lang1 is None:
+        lang1 = detect(" ".join(paras.iloc[:, 0]))
+    if lang2 is None:
+        lang2 = detect(" ".join(paras.iloc[:, 1]))
+    left, right = [], []
+    row0, row1 = [], []
+    for elm0, elm1, elm2 in paras.values:
+        sents0 = seg_text(elm0, lang1)
+        sents1 = seg_text(elm1, lang2)
+        if isinstance(elm2, float) and elm2 > 0:
+            if row0 or row1:
+                left.append(row0)
+                right.append(row1)
+            row0, row1 = [], []  # collect and prepare
+            if sents0:
+                left.append(sents0)
+            if sents1:
+                right.append(sents1)
+        else:
+            if sents0:
+                row0.extend(sents0)
+            if sents1:
+                row1.extend(sents1)
+    # collect possible last batch
+    if row0 or row1:
+        left.append(row0)
+        right.append(row1)
+    # res = [*zip(left, right)]
+    # align each batch using align_func
+    # ready align_func
+    if align_func is None:
+        align_func = align_sents
+    if isinstance(align_func, str) and align_func.startswith("shuffle") or not isinstance(align_func, str) and align_func.__name__ in ["shuffle_sents"]:
+        align_func = lambda row0, row1: shuffle_sents(row0, row1, lang1=lang1, lang2=lang2)  # noqa
+    else:
+        align_func = align_sents
+    res = []
+    for row0, row1 in zip(left, right):
+        try:
+            _ = align_func(row0, row1)
+        except Exception as exc:
+            logger.info("probably empty para supplied: %s, resorting to zip_longest", exc)
+            _ = [*zip_longest(row0, row1, fillvalue="")]
+        # res.append(_)
+        res.extend(_)
+    return res

requirements.txt CHANGED Viewed

@@ -23,6 +23,7 @@ pyicu
 pycld2
 tqdm
 polyglot
 sentence_splitter
 icecream
 # lazy

 pycld2
 tqdm
 polyglot
+nltk
 sentence_splitter
 icecream
 # lazy

tests/test_paras2sents.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Test paras2sents."""
 # pylint: disable=invalid-name
 import pandas as pd
 from radiobee.paras2sents import paras2sents
 from radiobee.shuffle_sents import shuffle_sents
@@ -14,15 +15,20 @@ def test_paras2sents_dual():
     """Test paras2sents_dual."""
     sents = paras2sents(paras)
     assert len(sents) > 202  # 208
     # assert not sents
 def test_paras2sents_dual_model_s():
     """Test paras2sents_dual_model_s."""
-    sents = paras2sents(paras, shuffle_sents)
-    assert len(sents) > 201  # 207
     # assert not sents

 """Test paras2sents."""
 # pylint: disable=invalid-name
+import numpy as np
 import pandas as pd
 from radiobee.paras2sents import paras2sents
 from radiobee.shuffle_sents import shuffle_sents
     """Test paras2sents_dual."""
     sents = paras2sents(paras)
+    assert np.array(sents).shape.__len__() > 1
     assert len(sents) > 202  # 208
     # assert not sents
 def test_paras2sents_dual_model_s():
     """Test paras2sents_dual_model_s."""
+    sents1 = paras2sents(paras, shuffle_sents)
+    # assert np.array(sents1).shape.__len__() > 1
+    assert pd.DataFrame(sents1).shape.__len__() > 1
+    assert len(sents1) > 201  # 207
     # assert not sents