ljyflores committed on
Commit
facce4e
1 Parent(s): 850fcc9

Split report into sentences

Browse files
__pycache__/utils_casemaker.cpython-310.pyc CHANGED
Binary files a/__pycache__/utils_casemaker.cpython-310.pyc and b/__pycache__/utils_casemaker.cpython-310.pyc differ
 
utils_casemaker.py CHANGED
@@ -34,11 +34,20 @@ def clean(s: str) -> str:
34
  return s
35
 
36
 
37
- def split_paragraphs(text: str) -> List[str]:
 
38
  paragraphs = text.split("\n\n")
39
  paragraphs = list(map(clean, paragraphs))
40
  paragraphs = list(filter(lambda s: len(s.split()) > 10, paragraphs))
41
- return paragraphs
 
 
 
 
 
 
 
 
42
 
43
 
44
  def format_casemaker_data(
@@ -137,8 +146,8 @@ class CaseMaker:
137
  """
138
  report_string_by_organ = dict[str, str]()
139
 
140
- # Split the report into a list of paragraphs
141
- paragraphs = split_paragraphs(report)
142
  # Collect a list of paragraphs related to each organ
143
  for p in paragraphs:
144
  # Figure out which organ is being referenced
 
34
  return s
35
 
36
 
37
+ def split_into_sentences(text: str):
38
+ # Split paragraphs
39
  paragraphs = text.split("\n\n")
40
  paragraphs = list(map(clean, paragraphs))
41
  paragraphs = list(filter(lambda s: len(s.split()) > 10, paragraphs))
42
+
43
+ # Split into sentences
44
+ sentences = [sent_tokenize(p) for p in paragraphs]
45
+ sentences = [
46
+ sent
47
+ for lst in sentences
48
+ for sent in lst if isinstance(sent, str)
49
+ ]
50
+ return sentences
51
 
52
 
53
  def format_casemaker_data(
 
146
  """
147
  report_string_by_organ = dict[str, str]()
148
 
149
+ # Split the report into a list of sentences
150
+ paragraphs = split_into_sentences(report)
151
  # Collect a list of paragraphs related to each organ
152
  for p in paragraphs:
153
  # Figure out which organ is being referenced