|
import re, os, sys
|
|
from utils import *
|
|
|
|
def get_paragraphs(text, cutoff=10):
    """Split *text* into paragraphs on runs of newline characters.

    Every piece is stripped of surrounding whitespace. Only pieces whose
    stripped length is strictly greater than *cutoff* are kept, in their
    original order.
    """
    paragraphs = []
    for piece in re.split(r'\n+', text, flags=re.MULTILINE):
        trimmed = piece.strip()
        if len(trimmed) > cutoff:
            paragraphs.append(trimmed)
    return paragraphs
|
|
|
|
def get_para_sentences(text, cutoff=60):
    """Group each paragraph of *text* into sentence-like chunks.

    Paragraphs come from ``get_paragraphs(text)``. Each paragraph is split on
    runs of periods; the fragments are glued back together (with a single
    "." restored after every fragment except the last) until the accumulated
    chunk is longer than *cutoff* characters, at which point it is emitted
    and a new chunk is started. Any non-empty leftover becomes the final
    chunk of the paragraph.

    Returns a list with one list of chunks per paragraph.
    """
    result = []
    for paragraph in get_paragraphs(text):
        fragments = re.split(r'\.+', paragraph)
        last_pos = len(fragments) - 1

        grouped = []
        buffer = ""
        for pos, fragment in enumerate(fragments):
            # Put back the period that the split consumed (none after the tail).
            buffer += fragment if pos == last_pos else fragment + "."
            if len(buffer) <= cutoff:
                continue
            grouped.append(buffer)
            buffer = ""

        # Whatever did not reach the cutoff still belongs to this paragraph.
        if buffer:
            grouped.append(buffer)

        result.append(grouped)
    return result
|
|
|
|
def get_idx_from_marked_chunk(marked_chunk):
    """Return the integer index carried by a leading ``<C n>`` marker.

    The marker must sit at the very start of *marked_chunk*; whitespace
    between ``C`` and the digits is optional. When the text does not start
    with such a marker the match is ``None`` and subscripting it raises
    ``TypeError``.
    """
    marker = re.match(r'<C\s*(\d+)>', marked_chunk)
    index_digits = marker[1]
    return int(index_digits)
|
|
# Import-time self-check: a randomly chosen index must survive a round trip
# through the "<C n>" marker format parsed by get_idx_from_marked_chunk.
import random
idx = random.randint(0, 99999)
assert get_idx_from_marked_chunk(f"<C {idx}> ha ha") == idx
|
|
|
|
|
|
def add_chunk_markers(text, lookup_idx=None, para=True):
    """Prefix every chunk of *text* with a running ``<C n>`` marker.

    With ``para=True`` the chunks are the paragraphs from ``get_paragraphs``;
    otherwise they are the per-paragraph sentence lists from
    ``get_para_sentences``. Chunks are numbered from 0 across the whole text.
    Each marked chunk is written on its own line and every paragraph group is
    followed by a blank line.

    If *lookup_idx* equals the index of a chunk, that marked chunk is printed
    and the process exits via ``sys.exit()``.

    Returns ``(marked_text, para_chunks)``. In sentence mode the inner lists
    of ``para_chunks`` are rewritten in place with the marked chunks; in
    paragraph mode ``para_chunks`` keeps the unmarked paragraph strings.
    """
    para_chunks = get_paragraphs(text) if para else get_para_sentences(text)

    pieces = []
    counter = 0
    for group in para_chunks:
        # A paragraph arrives as a bare string; wrap it so both modes share
        # the same inner loop.
        members = [group] if isinstance(group, str) else group
        for pos, member in enumerate(members):
            tagged = f"<C {counter}>{member.strip()}"
            members[pos] = tagged

            if lookup_idx == counter:
                print(tagged)
                sys.exit()

            pieces.append(f"{tagged}\n")
            counter += 1
        # Blank line between paragraph groups.
        pieces.append("\n")
    return "".join(pieces).strip(), para_chunks
|
|
|
|
|
|
# Character class for a single "word" character: ASCII digits and letters
# plus "đ" and the Vietnamese vowels with all their diacritic/tone variants.
# Only lowercase forms are listed because `word` is compiled with
# re.IGNORECASE, and each letter is listed once: repeating a character
# inside a character class does not change what the class matches.
alphabet = (
    '[0-9a-z'
    'aăâáắấàằầảẳẩãẵẫạặậ'
    'đ'
    'eêéếèềẻểẽễẹệ'
    'iíìỉĩị'
    'oôơóốớòồờỏổởõỗỡọộợ'
    'uưúứùừủửũữụự'
    'yýỳỷỹỵ'
    ']'
)

# Matches a maximal run of word characters, case-insensitively.
word = re.compile(f'{alphabet}+', re.IGNORECASE)
|
|
|
|
def hilite(query, source, hilite_color=YELLOW, source_color=GREY, query_color=None):
    """Highlight in *source* the words it shares with *query*.

    The distinct words of *query* (matches of the module-level ``word``
    pattern) are looked up in *source* as whole words, ignoring case. Every
    occurrence found is wrapped as ``hilite_color + match + source_color``.
    When *query_color* is given, the same words are wrapped in *query* as
    ``hilite_color + match + query_color``; words absent from *source* are
    left untouched in both strings.

    The color arguments are interpolated into regex replacement templates.
    NOTE(review): presumably terminal color codes such as the YELLOW/GREY
    defaults defined outside this file -- confirm they contain no
    backslashes, which the template would interpret.

    Returns the pair ``(source, query)`` after highlighting.
    """
    source_template = rf'{hilite_color}\1{source_color}'
    query_template = None
    if query_color is not None:
        query_template = rf'{hilite_color}\1{query_color}'

    for token in set(word.findall(query)):
        pattern = re.compile(
            rf"(\b{re.escape(token)}\b)", flags=re.IGNORECASE | re.MULTILINE
        )
        # Words that never occur in the source are skipped entirely.
        if pattern.search(source) is None:
            continue
        source = pattern.sub(source_template, source)
        if query_template is not None:
            query = pattern.sub(query_template, query)
    return source, query
|
|
|
|
|
|
def pretty_num(x):
    """Return *x* rounded to two decimal places.

    The value is scaled by 100, rounded to the nearest integer with the
    built-in ``round``, and scaled back down.
    """
    hundredths = round(x * 100)
    return hundredths / 100
|
|
|
|
def count_words(x):
    """Return the number of whitespace-separated tokens in the string *x*.

    An assertion guards the input type: a non-string argument fails the
    assert, whose message includes the offending value.
    """
    is_text = isinstance(x, str)
    assert is_text, f"đầu không phải string {x}"
    tokens = x.split()
    return len(tokens)
|
|
|
|
def extract_(text, tag):
    """Pull the content of an XML-like ``<tag>...</tag>`` section from *text*.

    The content is whatever precedes the first ``</tag>`` and follows the
    last ``<tag>`` inside that prefix. A missing opening or closing tag
    simply leaves that side of the text uncut.

    For the ``"summary"`` tag the stripped content is returned as one string.
    For any other tag the content is split on newlines and commas, a leading
    ``-`` bullet is removed from each item, items are stripped, and the
    non-empty ones are returned as a list.
    """
    closing, opening = f"</{tag}>", f"<{tag}>"
    before_close = text.partition(closing)[0]
    raw = before_close.rpartition(opening)[2]

    if tag == "summary":
        return raw.strip()

    items = []
    for piece in re.split(r'[\n,]+', raw):
        cleaned = re.sub(r'^\s*-\s*', '', piece).strip()
        if cleaned:
            items.append(cleaned)
    return items
|
|
|
|
def extract_xmls(text, tags):
    """Extract several tagged sections from *text* at once.

    Returns ``None`` when *text* is ``None``; otherwise a dict mapping each
    tag in *tags* to the result of ``extract_(text, tag)``.
    """
    if text is None:
        return None

    extracted = {}
    for tag in tags:
        extracted[tag] = extract_(text, tag)
    return extracted
|
|
|