freemt commited on
Commit
a15cd26
1 Parent(s): 5a186f5

Update sent-ali fast and slow

Browse files
img/plt.png CHANGED
radiobee/__main__.py CHANGED
@@ -139,7 +139,6 @@ if __name__ == "__main__":
139
  gr.inputs.File(label="file 2", optional=True),
140
  ]
141
 
142
- # modi 1
143
  _ = """
144
  tf_type: Literal[linear, sqrt, log, binary] = 'linear'
145
  idf_type: Optional[Literal[standard, smooth, bm25]] = None
@@ -159,10 +158,13 @@ if __name__ == "__main__":
159
  ) # ditto
160
  input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None") # ditto
161
 
162
- inputs = [
 
 
 
163
  gr.inputs.File(label="file 1"),
164
  gr.inputs.File(label="file 2", optional=True),
165
- input_tf_type, # modi inputs
166
  input_idf_type,
167
  input_dl_type,
168
  input_norm_type,
@@ -178,6 +180,7 @@ if __name__ == "__main__":
178
  step=1,
179
  default=6,
180
  ),
 
181
  ]
182
 
183
  examples = [
@@ -190,6 +193,29 @@ if __name__ == "__main__":
190
  "None",
191
  10,
192
  6,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  ],
194
  [
195
  "data/test_en.txt",
@@ -200,6 +226,7 @@ if __name__ == "__main__":
200
  "None",
201
  10,
202
  6,
 
203
  ],
204
  [
205
  "data/shakespeare_zh500.txt",
@@ -210,6 +237,7 @@ if __name__ == "__main__":
210
  "None",
211
  10,
212
  6,
 
213
  ],
214
  [
215
  "data/shakespeare_en500.txt",
@@ -220,6 +248,7 @@ if __name__ == "__main__":
220
  "None",
221
  10,
222
  6,
 
223
  ],
224
  [
225
  "data/hlm-ch1-zh.txt",
@@ -230,6 +259,7 @@ if __name__ == "__main__":
230
  "None",
231
  10,
232
  6,
 
233
  ],
234
  [
235
  "data/hlm-ch1-en.txt",
@@ -240,6 +270,7 @@ if __name__ == "__main__":
240
  "None",
241
  10,
242
  6,
 
243
  ],
244
  [
245
  "data/ps-cn.txt",
@@ -250,6 +281,7 @@ if __name__ == "__main__":
250
  "None",
251
  10,
252
  4,
 
253
  ],
254
  [
255
  "data/test-dual.txt",
@@ -260,6 +292,7 @@ if __name__ == "__main__":
260
  "None",
261
  10,
262
  6,
 
263
  ],
264
  [
265
  "data/英译中国现代散文选1(汉外对照丛书).txt",
@@ -270,6 +303,7 @@ if __name__ == "__main__":
270
  "None",
271
  10,
272
  6,
 
273
  ],
274
  [
275
  "data/test-zh-ja.txt",
@@ -280,6 +314,7 @@ if __name__ == "__main__":
280
  "None",
281
  10,
282
  6,
 
283
  ],
284
  [
285
  "data/xiyouji-ch1-zh.txt",
@@ -290,6 +325,7 @@ if __name__ == "__main__":
290
  "None",
291
  10,
292
  6,
 
293
  ],
294
  [
295
  "data/demian-hesse-de.txt",
@@ -300,6 +336,7 @@ if __name__ == "__main__":
300
  "None",
301
  10,
302
  6,
 
303
  ],
304
  [
305
  "data/catcher-in-the-rye-shixianrong-zh.txt",
@@ -310,6 +347,7 @@ if __name__ == "__main__":
310
  "None",
311
  10,
312
  6,
 
313
  ],
314
  ]
315
 
@@ -340,14 +378,23 @@ if __name__ == "__main__":
340
  out_file_dl_excel = gr.outputs.File(
341
  label="Click to download xlsx",
342
  )
 
 
 
 
 
 
 
 
343
 
344
- # modi outputs
345
- outputs = [
346
  out_df,
347
- # "plot",
348
  gr.outputs.Image(label="plot"),
349
  out_file_dl,
350
  out_file_dl_excel,
 
 
351
  out_df_aligned,
352
  gr.outputs.HTML(),
353
  ]
 
139
  gr.inputs.File(label="file 2", optional=True),
140
  ]
141
 
 
142
  _ = """
143
  tf_type: Literal[linear, sqrt, log, binary] = 'linear'
144
  idf_type: Optional[Literal[standard, smooth, bm25]] = None
 
158
  ) # ditto
159
  input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None") # ditto
160
 
161
+ # modi inputs 1, definitions
162
+ sent_ali_algo = gr.inputs.Radio(["None", "fast", "slow"], default="None")
163
+
164
+ inputs = [ # tot. 9, meed to modify input of gradio & examples
165
  gr.inputs.File(label="file 1"),
166
  gr.inputs.File(label="file 2", optional=True),
167
+ input_tf_type, # modi inputs 2
168
  input_idf_type,
169
  input_dl_type,
170
  input_norm_type,
 
180
  step=1,
181
  default=6,
182
  ),
183
+ sent_ali_algo,
184
  ]
185
 
186
  examples = [
 
193
  "None",
194
  10,
195
  6,
196
+ "None",
197
+ ],
198
+ [
199
+ "data/test_zh.txt",
200
+ "data/test_en.txt",
201
+ "linear",
202
+ "None",
203
+ "None",
204
+ "None",
205
+ 10,
206
+ 6,
207
+ "fast",
208
+ ],
209
+ [
210
+ "data/test_zh.txt",
211
+ "data/test_en.txt",
212
+ "linear",
213
+ "None",
214
+ "None",
215
+ "None",
216
+ 10,
217
+ 6,
218
+ "slow",
219
  ],
220
  [
221
  "data/test_en.txt",
 
226
  "None",
227
  10,
228
  6,
229
+ "None",
230
  ],
231
  [
232
  "data/shakespeare_zh500.txt",
 
237
  "None",
238
  10,
239
  6,
240
+ "None",
241
  ],
242
  [
243
  "data/shakespeare_en500.txt",
 
248
  "None",
249
  10,
250
  6,
251
+ "None",
252
  ],
253
  [
254
  "data/hlm-ch1-zh.txt",
 
259
  "None",
260
  10,
261
  6,
262
+ "None",
263
  ],
264
  [
265
  "data/hlm-ch1-en.txt",
 
270
  "None",
271
  10,
272
  6,
273
+ "None",
274
  ],
275
  [
276
  "data/ps-cn.txt",
 
281
  "None",
282
  10,
283
  4,
284
+ "None",
285
  ],
286
  [
287
  "data/test-dual.txt",
 
292
  "None",
293
  10,
294
  6,
295
+ "None",
296
  ],
297
  [
298
  "data/英译中国现代散文选1(汉外对照丛书).txt",
 
303
  "None",
304
  10,
305
  6,
306
+ "None",
307
  ],
308
  [
309
  "data/test-zh-ja.txt",
 
314
  "None",
315
  10,
316
  6,
317
+ "None",
318
  ],
319
  [
320
  "data/xiyouji-ch1-zh.txt",
 
325
  "None",
326
  10,
327
  6,
328
+ "None",
329
  ],
330
  [
331
  "data/demian-hesse-de.txt",
 
336
  "None",
337
  10,
338
  6,
339
+ "None",
340
  ],
341
  [
342
  "data/catcher-in-the-rye-shixianrong-zh.txt",
 
347
  "None",
348
  10,
349
  6,
350
+ "None",
351
  ],
352
  ]
353
 
 
378
  out_file_dl_excel = gr.outputs.File(
379
  label="Click to download xlsx",
380
  )
381
+ out_sents_dl = gr.outputs.File(
382
+ label="Click to download sents csv",
383
+ )
384
+ out_sents_dl_excel = gr.outputs.File(
385
+ label="Click to download sents xlsx",
386
+ )
387
+
388
+ # modi outputs 1, definitions
389
 
390
+ # modi outputs 2, need to modify gradio error_msg
391
+ outputs = [ # tot. 8
392
  out_df,
 
393
  gr.outputs.Image(label="plot"),
394
  out_file_dl,
395
  out_file_dl_excel,
396
+ out_sents_dl,
397
+ out_sents_dl_excel,
398
  out_df_aligned,
399
  gr.outputs.HTML(),
400
  ]
radiobee/align_sents.py DELETED
@@ -1,72 +0,0 @@
1
- """Align sents via gale-church."""
2
- # pylint: disable=invalid-name
3
-
4
- from typing import List, Tuple # noqa
5
-
6
- import re
7
-
8
- # from itertools import tee
9
- # from more_itertools import ilen
10
- from nltk.translate.gale_church import align_blocks
11
-
12
- from radiobee.amend_avec import amend_avec
13
-
14
-
15
- def align_sents(lst1: List[str], lst2: List[str]) -> List[Tuple[str, str]]:
16
- """Align sents.
17
-
18
- >>> lst1, lst2 = ['a', 'bs',], ['aaa', '34', 'a', 'b']
19
- """
20
- if isinstance(lst1, str):
21
- lst1 = [lst1]
22
-
23
- if isinstance(lst2, str):
24
- lst2 = [lst2]
25
-
26
- src_blocks = [len(re.sub(r"\s+", "", elm)) for elm in lst1]
27
- tgt_blocks = [len(re.sub(r"\s+", "", elm)) for elm in lst2]
28
-
29
- avec = align_blocks(src_blocks, tgt_blocks)
30
-
31
- len1, len2 = len(lst1), len(lst2)
32
- # lst1, _ = tee(lst1)
33
- # len1 = ilen(_)
34
- # lst2, _ = tee(lst2)
35
- # len2 = ilen(_)
36
-
37
- amended_avec = amend_avec(avec, len1, len2)
38
-
39
- texts = []
40
- # for elm in aset:
41
- # for elm0, elm1 in amended_avec:
42
- for elm in amended_avec:
43
- # elm0, elm1, elm2 = elm
44
- elm0, elm1 = elm[:2]
45
- _ = []
46
-
47
- # src_text first
48
- if isinstance(elm0, str):
49
- _.append("")
50
- else:
51
- # _.append(src_text[int(elm0)])
52
- _.append(lst1[int(elm0)])
53
-
54
- if isinstance(elm1, str):
55
- _.append("")
56
- else:
57
- # _.append(tgt_text[int(elm0)])
58
- _.append(lst2[int(elm1)])
59
-
60
- _a = """
61
- if isinstance(elm2, str):
62
- _.append("")
63
- else:
64
- _.append(round(elm2, 2))
65
- # """
66
- del _a
67
-
68
- texts.append(tuple(_))
69
-
70
- return texts
71
-
72
- # return ["", ""]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
radiobee/align_sents.pyc ADDED
Binary file (1.55 kB). View file
 
radiobee/error_msg.py CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
8
  def error_msg(
9
  msg: Optional[Union[str, Exception]],
10
  title: str = "error message",
11
- ) -> Tuple[Union[pd.DataFrame, None], None, None, None, None, None]:
12
  """Prepare an error message for gradiobee outputs."""
13
  if msg is None:
14
  msg = "none..."
@@ -21,4 +21,4 @@ def error_msg(
21
  df = pd.DataFrame([msg], columns=[title])
22
 
23
  # return df, *((None,) * 4) # pyright complains
24
- return df, None, None, None, None, None
 
8
  def error_msg(
9
  msg: Optional[Union[str, Exception]],
10
  title: str = "error message",
11
+ ) -> Tuple[Union[pd.DataFrame, None], None, None, None, None, None, None, None]:
12
  """Prepare an error message for gradiobee outputs."""
13
  if msg is None:
14
  msg = "none..."
 
21
  df = pd.DataFrame([msg], columns=[title])
22
 
23
  # return df, *((None,) * 4) # pyright complains
24
+ return df, None, None, None, None, None, None, None
radiobee/gradiobee.py CHANGED
@@ -30,6 +30,10 @@ from radiobee.trim_df import trim_df
30
  from radiobee.error_msg import error_msg
31
  from radiobee.text2lists import text2lists
32
 
 
 
 
 
33
  uname = platform.uname()
34
  HFSPACES = False
35
  if "amzn2" in uname.release: # on hf spaces
@@ -43,7 +47,7 @@ debug = False
43
  debug = True
44
 
45
 
46
- def gradiobee(
47
  file1,
48
  file2,
49
  tf_type,
@@ -53,6 +57,7 @@ def gradiobee(
53
  eps,
54
  min_samples,
55
  # debug=False,
 
56
  ):
57
  """Process inputs and return outputs."""
58
  logger.debug(" *debug* ")
@@ -382,7 +387,7 @@ def gradiobee(
382
  df_aligned = df_aligned[["text2", "text1", "likelihood"]]
383
  df_aligned.columns = ["text1", "text2", "likelihood"]
384
 
385
- ic(df_aligned.head())
386
 
387
  # round the last column to 2
388
  # df_aligned.likelihood = df_aligned.likelihood.round(2)
@@ -434,8 +439,66 @@ def gradiobee(
434
  # return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned
435
  # return df_trimmed, output_plot, file_dl, file_dl_xlsx, styled, df_html # gradio cant handle style
436
 
437
- ic("returning outputs")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
 
439
- return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned, df_html
 
440
 
441
- # modi outputs
 
 
30
  from radiobee.error_msg import error_msg
31
  from radiobee.text2lists import text2lists
32
 
33
+ from radiobee.align_sents import align_sents
34
+ from radiobee.shuffle_sents import shuffle_sents # type: ignore
35
+ from radiobee.paras2sents import paras2sents # type: ignore
36
+
37
  uname = platform.uname()
38
  HFSPACES = False
39
  if "amzn2" in uname.release: # on hf spaces
 
47
  debug = True
48
 
49
 
50
+ def gradiobee( # noqa
51
  file1,
52
  file2,
53
  tf_type,
 
57
  eps,
58
  min_samples,
59
  # debug=False,
60
+ sent_ali_algo,
61
  ):
62
  """Process inputs and return outputs."""
63
  logger.debug(" *debug* ")
 
387
  df_aligned = df_aligned[["text2", "text1", "likelihood"]]
388
  df_aligned.columns = ["text1", "text2", "likelihood"]
389
 
390
+ ic("paras aligned: ", df_aligned.head(10))
391
 
392
  # round the last column to 2
393
  # df_aligned.likelihood = df_aligned.likelihood.round(2)
 
439
  # return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned
440
  # return df_trimmed, output_plot, file_dl, file_dl_xlsx, styled, df_html # gradio cant handle style
441
 
442
+ ic("sent-ali-algo: ", sent_ali_algo)
443
+
444
+ # ### sent-ali-algo is None: para align
445
+ if sent_ali_algo in ["None"]:
446
+ ic("returning para-ali outputs")
447
+ return df_trimmed, output_plot, file_dl, file_dl_xlsx, None, None, df_aligned, df_html
448
+
449
+ # ### proceed with sent align
450
+ if sent_ali_algo in ["fast"]:
451
+ ic(sent_ali_algo)
452
+ align_func = align_sents
453
+
454
+ ic(df_aligned.shape, df_aligned.columns)
455
+
456
+ aligned_sents = paras2sents(df_aligned, align_func)
457
+
458
+ # ic(pd.DataFrame(aligned_sents).shape, aligned_sents)
459
+ ic(pd.DataFrame(aligned_sents).shape)
460
+
461
+ df_aligned_sents = pd.DataFrame(aligned_sents, columns=["text1", "text2"])
462
+ else: # ["slow"]
463
+ ic(sent_ali_algo)
464
+ align_func = shuffle_sents
465
+ aligned_sents = paras2sents(df_aligned, align_func, lang1, lang2)
466
+
467
+ # add extra entry if necessary
468
+ aligned_sents = [list(sent) + [""] if len(sent) == 2 else list(sent) for sent in aligned_sents]
469
+
470
+ df_aligned_sents = pd.DataFrame(aligned_sents, columns=["text1", "text2", "likelihood"])
471
+
472
+ # prepare sents downloads
473
+ file_dl_sents = Path(f"{file_dl.stem}-sents{file_dl.suffix}")
474
+ file_dl_xlsx_sents = Path(f"{file_dl_xlsx.stem}-sents{file_dl_xlsx.suffix}")
475
+ _ = df_aligned_sents.to_csv(index=False)
476
+ file_dl_sents.write_text(_, encoding="utf8")
477
+
478
+ df_aligned_sents.to_excel(file_dl_xlsx_sents)
479
+
480
+ # prepare html output
481
+ if len(df_aligned_sents) > 200:
482
+ df_html = None
483
+ else: # show a one-bathc table in html
484
+ # style
485
+ styled = df_aligned_sents.style.set_properties(
486
+ **{
487
+ "font-size": "10pt",
488
+ "border-color": "black",
489
+ "border": "1px black solid !important"
490
+ }
491
+ # border-color="black",
492
+ ).set_table_styles([{
493
+ "selector": "", # noqs
494
+ "props": [("border", "2px black solid !important")]}] # noqs
495
+ ).format(
496
+ precision=2
497
+ )
498
+ df_html = styled.to_html()
499
 
500
+ # aligned sents outputs
501
+ ic("aligned sents outputs")
502
 
503
+ # return df_trimmed, output_plot, file_dl, file_dl_xlsx, None, None, df_aligned, df_html
504
+ return df_trimmed, output_plot, file_dl, file_dl_xlsx, file_dl_sents, file_dl_xlsx_sents, df_aligned_sents, df_html
radiobee/paras2sents.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Convert paras to sents."""
2
+ # pylint: disable=unused-import, too-many-branches, ungrouped-imports
3
+
4
+ from typing import Callable, List, Optional, Tuple, Union
5
+
6
+ from itertools import zip_longest
7
+ import numpy as np
8
+ import pandas as pd
9
+ from logzero import logger
10
+
11
+ from radiobee.align_sents import align_sents
12
+ from radiobee.seg_text import seg_text
13
+ from radiobee.detect import detect
14
+
15
+ try:
16
+ from radiobee.shuffle_sents import shuffle_sents
17
+ except Exception as exc:
18
+ logger.error("shuffle_sents not available: %s, using align_sents", exc)
19
+ shuffle_sents = lambda x1, x2, lang1="", lang2="": align_sents(x1, x2) # noqa
20
+
21
+
22
+ def paras2sents(
23
+ paras_: Union[pd.DataFrame, List[Tuple[str, str, Union[str, float]]], np.ndarray],
24
+ align_func: Optional[Union[Callable, str]] = None,
25
+ lang1: Optional[str] = None,
26
+ lang2: Optional[str] = None,
27
+ ) -> List[Tuple[str, str, Union[str, float]]]:
28
+ """Convert paras to sents using align_func.
29
+
30
+ Args:
31
+ paras_: list of 3-tuples or numpy or pd.DataFrame
32
+ lang1: fisrt lang code
33
+ lang2: second lang code
34
+ align_func: func used in the sent level
35
+ if set to None, default to align_sents
36
+ Returns:
37
+ list of sents (possible with likelihood for shuffle_sents)
38
+ """
39
+ # wrap everything in pd.DataFrame
40
+ # necessary to make pyright happy
41
+ paras = pd.DataFrame(paras_).fillna("")
42
+
43
+ # take the first three columns at maximum
44
+ paras = paras.iloc[:, :3]
45
+
46
+ if len(paras.columns) < 2:
47
+ logger.error(
48
+ "Need at least two columns, got %s",
49
+ len(paras.columns)
50
+ )
51
+ raise Exception("wrong data")
52
+
53
+ # append the third col (all "") if there are only two cols
54
+ if len(paras.columns) < 3:
55
+ paras.insert(2, "likelihood", [""] * len(paras))
56
+
57
+ if lang1 is None:
58
+ lang1 = detect(" ".join(paras.iloc[:, 0]))
59
+ if lang2 is None:
60
+ lang2 = detect(" ".join(paras.iloc[:, 1]))
61
+
62
+ left, right = [], []
63
+ row0, row1 = [], []
64
+ for elm0, elm1, elm2 in paras.values:
65
+ sents0 = seg_text(elm0, lang1)
66
+ sents1 = seg_text(elm1, lang2)
67
+ if isinstance(elm2, float) and elm2 > 0:
68
+ if row0 or row1:
69
+ left.append(row0)
70
+ right.append(row1)
71
+ row0, row1 = [], [] # collect and prepare
72
+
73
+ if sents0:
74
+ left.append(sents0)
75
+ if sents1:
76
+ right.append(sents1)
77
+ else:
78
+ if sents0:
79
+ row0.extend(sents0)
80
+ if sents1:
81
+ row1.extend(sents1)
82
+ # collect possible last batch
83
+ if row0 or row1:
84
+ left.append(row0)
85
+ right.append(row1)
86
+
87
+ # res = [*zip(left, right)]
88
+
89
+ # align each batch using align_func
90
+
91
+ # ready align_func
92
+ if align_func is None:
93
+ align_func = align_sents
94
+ if isinstance(align_func, str) and align_func.startswith("shuffle") or not isinstance(align_func, str) and align_func.__name__ in ["shuffle_sents"]:
95
+ align_func = lambda row0, row1: shuffle_sents(row0, row1, lang1=lang1, lang2=lang2) # noqa
96
+ else:
97
+ align_func = align_sents
98
+
99
+ res = []
100
+ for row0, row1 in zip(left, right):
101
+ try:
102
+ _ = align_func(row0, row1)
103
+ except Exception as exc:
104
+ logger.info("probably empty para supplied: %s, resorting to zip_longest", exc)
105
+ _ = [*zip_longest(row0, row1, fillvalue="")]
106
+
107
+ # res.append(_)
108
+ res.extend(_)
109
+
110
+ return res
requirements.txt CHANGED
@@ -23,6 +23,7 @@ pyicu
23
  pycld2
24
  tqdm
25
  polyglot
 
26
  sentence_splitter
27
  icecream
28
  # lazy
 
23
  pycld2
24
  tqdm
25
  polyglot
26
+ nltk
27
  sentence_splitter
28
  icecream
29
  # lazy
tests/test_paras2sents.py CHANGED
@@ -1,6 +1,7 @@
1
  """Test paras2sents."""
2
  # pylint: disable=invalid-name
3
 
 
4
  import pandas as pd
5
  from radiobee.paras2sents import paras2sents
6
  from radiobee.shuffle_sents import shuffle_sents
@@ -14,15 +15,20 @@ def test_paras2sents_dual():
14
  """Test paras2sents_dual."""
15
  sents = paras2sents(paras)
16
 
 
 
17
  assert len(sents) > 202 # 208
18
  # assert not sents
19
 
20
 
21
  def test_paras2sents_dual_model_s():
22
  """Test paras2sents_dual_model_s."""
23
- sents = paras2sents(paras, shuffle_sents)
 
 
 
24
 
25
- assert len(sents) > 201 # 207
26
  # assert not sents
27
 
28
 
 
1
  """Test paras2sents."""
2
  # pylint: disable=invalid-name
3
 
4
+ import numpy as np
5
  import pandas as pd
6
  from radiobee.paras2sents import paras2sents
7
  from radiobee.shuffle_sents import shuffle_sents
 
15
  """Test paras2sents_dual."""
16
  sents = paras2sents(paras)
17
 
18
+ assert np.array(sents).shape.__len__() > 1
19
+
20
  assert len(sents) > 202 # 208
21
  # assert not sents
22
 
23
 
24
  def test_paras2sents_dual_model_s():
25
  """Test paras2sents_dual_model_s."""
26
+ sents1 = paras2sents(paras, shuffle_sents)
27
+
28
+ # assert np.array(sents1).shape.__len__() > 1
29
+ assert pd.DataFrame(sents1).shape.__len__() > 1
30
 
31
+ assert len(sents1) > 201 # 207
32
  # assert not sents
33
 
34