# tomtat/text_utils.py
import re, os, sys
from utils import *

# Split text into paragraphs on newline runs; drop fragments shorter than `cutoff` characters.
def get_paragraphs(text, cutoff=10):
    return [ x.strip() for x in re.split(r'\n+', text, flags=re.MULTILINE) if len(x.strip()) > cutoff ]
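
# Illustrative sanity check (made-up input, in the same inline-assert style used elsewhere in this file):
# short fragments are dropped, longer paragraphs are kept and stripped.
assert get_paragraphs("short\n\nThis paragraph is long enough to keep.") == \
    ["This paragraph is long enough to keep."]
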
# Split text into paragraphs, then split each paragraph into sentence-like chunks.
# Chunks are merged on "." boundaries until a chunk exceeds `cutoff` characters.
def get_para_sentences(text, cutoff=60):
    para_sents = []
    for para in get_paragraphs(text):
        sents = []; sent = ""
        chunks = re.split(r'\.+', para); n = len(chunks)
        for i in range(n):
            sent += chunks[i]
            if i < n - 1: sent += "."
            if len(sent) > cutoff:
                sents.append(sent)
                sent = ""
        if len(sent) > 0: sents.append(sent)
        # print(sents); input()
        para_sents.append(sents)
    return para_sents
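
# Illustrative check (made-up input): with a small cutoff, each sentence stays a separate chunk,
# and the result is one list of sentences per paragraph.
_demo_sents = get_para_sentences("One short sentence here. Another follows right after it.", cutoff=10)
assert len(_demo_sents) == 1 and len(_demo_sents[0]) == 2
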
# Parse the integer index back out of a "<C idx>" marker prefix.
def get_idx_from_marked_chunk(marked_chunk):
    return int(re.match(r'<C\s*(\d+)>', marked_chunk)[1])

# Quick self-test: marking then parsing must round-trip the index.
import random; idx = random.randint(0, 99999)
assert get_idx_from_marked_chunk(f"<C {idx}> ha ha") == idx

# Prefix every chunk with a "<C idx>" marker and return the marked text plus the chunks.
# With para=True chunks are whole paragraphs, otherwise they are paragraph sentences.
# If lookup_idx is given, print that chunk and exit (debug helper).
def add_chunk_markers(text, lookup_idx=None, para=True):
    if para: para_chunks = get_paragraphs(text)
    else: para_chunks = get_para_sentences(text)
    marked_text = ""; chunk_idx = 0
    for chunks in para_chunks:
        if isinstance(chunks, str): chunks = [chunks]
        for idx, chunk in enumerate(chunks):
            marked_chunk = f"<C {chunk_idx}>{chunk.strip()}"
            chunks[idx] = marked_chunk
            if lookup_idx == chunk_idx: print(marked_chunk); sys.exit()  # assert False, f"Found {lookup_idx}"
            marked_text += f"{marked_chunk}\n"
            chunk_idx += 1
        marked_text += "\n"
    return marked_text.strip(), para_chunks
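
# Illustrative usage (made-up text): each paragraph gets a "<C idx>" prefix and
# chunk indices increase across paragraphs.
_marked, _ = add_chunk_markers("First paragraph, long enough to keep.\n\nSecond paragraph, also long enough.")
assert _marked.startswith("<C 0>") and "<C 1>Second" in _marked
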
# Character class covering digits plus ASCII and Vietnamese lowercase letters (used with re.IGNORECASE).
alphabet = '[0-9a-zaăâáắấàằầảẳẩãẵẫạặậđeêéếèềẻểẽễẹệiíìỉĩịoôơóốớòồờỏổởõỗỡọộợuưúứùừủửũữụựyýỳỷỹỵ]'
word = re.compile(f'{alphabet}+', re.IGNORECASE)
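
# Illustrative check (made-up input): `word` matches runs of digits, ASCII and Vietnamese letters.
assert re.findall(word, "xin chào 2024") == ["xin", "chào", "2024"]
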
###
# Wrap every word of `query` that also appears in `source` with ANSI color codes.
# YELLOW / GREY are expected to come from `utils`.
def hilite(query, source, hilite_color=YELLOW, source_color=GREY, query_color=None):
    for keyword in set(re.findall(word, query)):
        keyword = re.escape(keyword)
        re_keyword = re.compile(rf"(\b{keyword}\b)", flags=re.IGNORECASE | re.MULTILINE)
        if re_keyword.search(source):
            source = re.sub(re_keyword, rf'{hilite_color}\1{source_color}', source)
            if query_color is not None:
                query = re.sub(re_keyword, rf'{hilite_color}\1{query_color}', query)
    return source, query
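
# Illustrative usage with explicit ANSI codes passed in (the real defaults YELLOW/GREY
# come from `utils`): matched query words get wrapped in the hilite color inside the source.
_src, _ = hilite("keyword", "a keyword in the source",
                 hilite_color="\033[93m", source_color="\033[90m")
assert "\033[93mkeyword\033[90m" in _src
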
# Round to two decimal places.
def pretty_num(x):
    return round(x * 100) / 100
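
# Illustrative check: rounding to two decimal places.
assert pretty_num(3.14159) == 3.14
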
# Count whitespace-separated words.
def count_words(x):
    assert isinstance(x, str), f"input is not a string: {x}"
    return len(x.split())
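
# Illustrative check: whitespace-separated word count.
assert count_words("two words") == 2
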
# Extract the content of <tag>...</tag> from `text`.
# For the "summary" tag return the raw stripped text; otherwise return a cleaned list of
# items split on newlines/commas with leading "-" bullets removed.
def extract_(text, tag):
    raw = text.split(f"</{tag}>")[0].split(f"<{tag}>")[-1]
    if tag == "summary": return raw.strip()
    splits = re.split(r'[\n,]+', raw)
    splits = [ re.sub(r'^\s*-\s*', '', s).strip() for s in splits ]
    splits = [ s for s in splits if len(s) > 0 ]
    return splits
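
# Illustrative check (made-up tag and content): bullet items inside a non-summary tag
# come back as a cleaned list.
assert extract_("<keywords>\n- alpha\n- beta\n</keywords>", "keywords") == ["alpha", "beta"]
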
# Extract several tags at once; returns None if the input text is None.
def extract_xmls(text, tags):
    if text is None: return None
    return { tag: extract_(text, tag) for tag in tags }
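
# Illustrative usage (made-up response text): pull several tagged fields from one response at once.
assert extract_xmls("<summary> ok </summary>", ["summary"]) == {"summary": "ok"}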