freemt committed on
Commit
06e32d2
1 Parent(s): a15cd26

Update sent-ali fast and slow

Browse files
Files changed (1) hide show
  1. radiobee/paras2sents.py +0 -110
radiobee/paras2sents.py DELETED
@@ -1,110 +0,0 @@
1
- """Convert paras to sents."""
2
- # pylint: disable=unused-import, too-many-branches, ungrouped-imports
3
-
4
- from typing import Callable, List, Optional, Tuple, Union
5
-
6
- from itertools import zip_longest
7
- import numpy as np
8
- import pandas as pd
9
- from logzero import logger
10
-
11
- from radiobee.align_sents import align_sents
12
- from radiobee.seg_text import seg_text
13
- from radiobee.detect import detect
14
-
15
# shuffle_sents is treated as optional: if importing it raises any Exception,
# the error is logged and a stand-in with the same call shape is bound to the
# name instead. The stand-in accepts (and ignores) lang1/lang2 and simply
# delegates to align_sents.
try:
    from radiobee.shuffle_sents import shuffle_sents
except Exception as exc:
    logger.error("shuffle_sents not available: %s, using align_sents", exc)
    # NOTE: kept as a lambda on purpose; its __name__ is "<lambda>", not
    # "shuffle_sents", which matters to code that dispatches on __name__.
    shuffle_sents = lambda x1, x2, lang1="", lang2="": align_sents(x1, x2)  # noqa
20
-
21
-
22
def paras2sents(
    paras_: Union[pd.DataFrame, List[Tuple[str, str, Union[str, float]]], np.ndarray],
    align_func: Optional[Union[Callable, str]] = None,
    lang1: Optional[str] = None,
    lang2: Optional[str] = None,
) -> List[Tuple[str, str, Union[str, float]]]:
    """Convert aligned paragraphs to aligned sentences using align_func.

    Rows whose third column is a positive float are treated as anchored
    paragraph pairs and are aligned on their own. Consecutive rows that are
    not anchored are pooled together and aligned as one batch.

    Args:
        paras_: list of 3-tuples, numpy array or pd.DataFrame; only the
            first three columns are used, and a missing third column is
            filled with "".
        align_func: aligner used at the sentence level. None or a string
            not starting with "shuffle" selects align_sents; a string
            starting with "shuffle" or a callable named "shuffle_sents"
            selects shuffle_sents (called with lang1/lang2); any other
            callable is called as ``align_func(sents_left, sents_right)``.
        lang1: first language code; detected from the first column if None.
        lang2: second language code; detected from the second column if None.

    Returns:
        list of aligned sentence tuples (possibly with a likelihood when
        shuffle_sents is used).

    Raises:
        ValueError: if the input has fewer than two columns.
    """
    # Wrap everything in pd.DataFrame (also necessary to make pyright happy).
    paras = pd.DataFrame(paras_).fillna("")

    # Take the first three columns at maximum.
    paras = paras.iloc[:, :3]

    n_cols = len(paras.columns)
    if n_cols < 2:
        logger.error("Need at least two columns, got %s", n_cols)
        # ValueError is still caught by callers using `except Exception`.
        raise ValueError("wrong data")

    # Append the third column (all "") if there are only two columns.
    if n_cols < 3:
        paras.insert(2, "likelihood", [""] * len(paras))

    if lang1 is None:
        lang1 = detect(" ".join(paras.iloc[:, 0]))
    if lang2 is None:
        lang2 = detect(" ".join(paras.iloc[:, 1]))

    # left[i] / right[i] hold the sentences of the i-th batch to be aligned;
    # the two lists must always grow together so that zip() pairs them up.
    left, right = [], []
    pending0, pending1 = [], []
    for text0, text1, likelihood in paras.values:
        sents0 = seg_text(text0, lang1)
        sents1 = seg_text(text1, lang2)
        if isinstance(likelihood, float) and likelihood > 0:
            # An anchored row closes the batch of unanchored rows before it.
            if pending0 or pending1:
                left.append(pending0)
                right.append(pending1)
                pending0, pending1 = [], []

            # Append both sides together, even when one is empty; appending
            # only the non-empty side would shift every later batch.
            if sents0 or sents1:
                left.append(sents0 or [])
                right.append(sents1 or [])
        else:
            if sents0:
                pending0.extend(sents0)
            if sents1:
                pending1.extend(sents1)

    # Collect a possible last batch.
    if pending0 or pending1:
        left.append(pending0)
        right.append(pending1)

    # Resolve the aligner. getattr() guards callables without __name__
    # (e.g. functools.partial objects).
    if isinstance(align_func, str):
        wants_shuffle = align_func.startswith("shuffle")
    else:
        wants_shuffle = getattr(align_func, "__name__", "") == "shuffle_sents"

    if wants_shuffle:

        def aligner(sents_left, sents_right):
            return shuffle_sents(sents_left, sents_right, lang1=lang1, lang2=lang2)

    elif callable(align_func):
        # Honour a caller-supplied aligner instead of silently replacing it.
        aligner = align_func
    else:
        # None or an unrecognised string: default aligner.
        aligner = align_sents

    res = []
    for batch0, batch1 in zip(left, right):
        try:
            aligned = aligner(batch0, batch1)
        except Exception as exc:  # deliberate best-effort fallback
            logger.info("probably empty para supplied: %s, resorting to zip_longest", exc)
            aligned = [*zip_longest(batch0, batch1, fillvalue="")]

        res.extend(aligned)

    return res