pszemraj committed
Commit c1cba4f
Parent: ca983bc

⚰️ 🎨 clean up and rm verbose testing code


Signed-off-by: peter szemraj <[email protected]>

Files changed (3)
  1. aggregate.py +1 -1
  2. app.py +2 -23
  3. utils.py +1 -1
aggregate.py CHANGED
@@ -7,8 +7,8 @@ How it works:
 2. The language model does it.
 3. Yaay!
 """
-import pprint as pp
 import logging
+import pprint as pp
 import time
 
 import torch
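
The only change in aggregate.py is moving `import pprint as pp` below `import logging`, i.e. alphabetizing the imports. The result matches what isort produces; whether the commit actually ran isort is an assumption. A minimal sketch using isort's Python API:

import isort

# the import block as it stood before this commit
before = "import pprint as pp\nimport logging\nimport time\n"

# isort.code() returns the source with imports alphabetized,
# matching the post-commit order: logging, pprint, time
print(isort.code(before))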
app.py CHANGED
@@ -19,9 +19,9 @@ import contextlib
 import gc
 import logging
 import os
+import pprint as pp
 import random
 import re
-import pprint as pp
 import sys
 import time
 from pathlib import Path
@@ -47,13 +47,12 @@ from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import (
     contraction_aware_tokenize,
     extract_batches,
-    extract_keywords,
     load_example_filenames,
     remove_stagnant_files,
+    remove_stopwords,
     saves_summary,
     textlist2html,
     truncate_word_count,
-    remove_stopwords,
 )
 
 _here = Path(__file__).parent
@@ -268,22 +267,6 @@ def proc_submission(
     model_input_text = truncation_validated["processed_text"]
     msg = None
 
-    if predrop_stopwords:
-        # TODO: remove this
-
-        outdir = Path.cwd() / "scratch" / "predrop_stopwords-v4"
-        outdir.mkdir(parents=True, exist_ok=True)
-        keywords_cln = " ".join(extract_keywords(cln_text, kw_max_len=4))
-        keywords_sw_removed = "_".join(extract_keywords(model_input_text, kw_max_len=4))
-        cln_filename = f"{keywords_cln}_{len(cln_text)}.txt"
-        cln_outdir = outdir.parent / "source-text"
-        cln_outdir.mkdir(parents=True, exist_ok=True)
-        with open(cln_outdir / cln_filename, "w", encoding="utf-8") as f:
-            f.write(cln_text)
-        sw_rm_filename = f"{keywords_sw_removed}_{len(model_input_text)}.txt"
-        with open(outdir / sw_rm_filename, "w", encoding="utf-8") as f:
-            f.write(model_input_text)
-        logging.info(f"saved predrop_stopwords file to {outdir / sw_rm_filename}")
     if len(input_text) < 50:
         # this is essentially a different case from the above
         msg = f"""
@@ -326,7 +309,6 @@ def proc_submission(
 
     html += ""
 
-    # save to file
     settings["remove_stopwords"] = predrop_stopwords
     settings["model_name"] = model_name
     saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
@@ -460,9 +442,6 @@ def parse_args():
         choices=["DEBUG", "INFO", "WARNING", "ERROR"],
         help="Set the logging level",
     )
-    # if "--help" in sys.argv or "-h" in sys.argv:
-    #     parser.print_help()
-    #     sys.exit(0)
 
     return parser.parse_args()
 
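The largest removal is the block in proc_submission that dumped the original and stopword-stripped inputs to a scratch directory (the testing code named in the commit message; note its own `# TODO: remove this`). Meanwhile `import pprint as pp` is kept in app.py, and one plausible, lighter-weight debugging pattern it supports is logging a settings dict instead of writing files. This sketch is illustrative only: the keys mirror the ones assigned above, but the values are invented, and the diff does not show how pp is actually used.

import logging
import pprint as pp

logging.basicConfig(level=logging.INFO)

# hypothetical settings dict; "remove_stopwords" and "model_name" mirror
# keys assigned in proc_submission, the values here are made up
settings = {
    "remove_stopwords": False,
    "model_name": "placeholder-summarization-model",
}

# pp.pformat() renders the dict as an indented, readable string for the log,
# giving visibility without the scratch-file dumps this commit deletes
logging.info("summarization settings:\n%s", pp.pformat(settings))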
utils.py CHANGED
@@ -19,7 +19,7 @@ logging.basicConfig(
 
 import torch
 from natsort import natsorted
-from nltk.tokenize import word_tokenize, WhitespaceTokenizer, sent_tokenize
+from nltk.tokenize import WhitespaceTokenizer, sent_tokenize, word_tokenize
 from rapidfuzz import fuzz
 
 STOPWORDS = set(
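
utils.py likewise only has its nltk import alphabetized; all three tokenizers stay in use. Since the module defines contraction_aware_tokenize, the practical difference between them is how they handle contractions. A short comparison, assuming the standard punkt data is installed (newer nltk releases may require "punkt_tab" instead):

import nltk
from nltk.tokenize import WhitespaceTokenizer, sent_tokenize, word_tokenize

nltk.download("punkt", quiet=True)  # one-time download of tokenizer data

text = "Don't stop now. The model isn't done."

print(word_tokenize(text))
# ['Do', "n't", 'stop', 'now', '.', 'The', 'model', 'is', "n't", 'done', '.']

print(WhitespaceTokenizer().tokenize(text))
# ["Don't", 'stop', 'now.', 'The', 'model', "isn't", 'done.']

print(sent_tokenize(text))
# ["Don't stop now.", "The model isn't done."]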