Spaces:
Runtime error
Runtime error
File size: 5,207 Bytes
9fa6437 7d2f336 b5dcb77 7d2f336 710b787 b962a46 b5dcb77 b962a46 7d2f336 9fa6437 7d2f336 710b787 7d2f336 b5dcb77 7d2f336 b5dcb77 7d2f336 862cd29 7d2f336 7bc15eb 9fa6437 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import gradio as gr
import pypandoc
import glob
import shutil
import os
import tqdm
from huggingface_hub import snapshot_download
import tempfile
import re
import pdfminer
# Log the versions of the document-conversion libraries at startup.
print("pdfminer", pdfminer.__version__)
print("pandoc", pypandoc.__version__)
# Disabled code: appears to be the start of a .docx report output (unused).
#from docx import Document
#document = Document()
#document.add_heading('Labels for ', level=1)
# Folder into which main() appends one "<category>.result.txt" file per category.
RESULTS_FOLDER = "./results"
# Default code words for every category. main() passes each list to
# match_code(), which matches the entries as whole words (case-sensitive).
# The lists are also used as the placeholder of the per-category textboxes.
CAT_TO_CODEWORDS = {
"Prejudices": ["prejudice", "judge", "preconceive", "stigma", "assumption", "assume", "misunderstanding", "unexamined", "distorted", "clear", "compar"],
"Self-knowledge": ["self-knowledge", "self-awareness", "introspection", "examined", "myself", "realization", "belief"],
"Similarities": ["similarity", "same", "similar", "equal", "related", "together"],
"Diversity": ["diverse", "different", "diverse", "particular", "range", "multiplicity"],
"Business school": ["ESADE", "competitive", "business school", "education", "study", "university", "student", "consulting", "professional", "pressure", "performance", "institution"],
"Courage": ["courage", "brave", "dare", "step", "determine"],
"Change": ["change", "finally", "at last", "decided", "chose", "concluded", "want to", "swap", "different", "not the same", "replace", "convert", "trade", "future", "decision"],
"Coherence": ["coherent", "align", "incoherent", "consistent"],
"Voicing": ["speak", "express", "voice", "talk", "say", "open up", "articulate", "communicate", "convey", "reveal", "show", "verbalize", "phrase", "word"],
"Listening": ["listen", "pay attention", "quiet", "silence", "process", "hear", "attend"],
"Understanding": ["learn", "understand", "realize", "see", "believe", "question", "critical", "thought", "reasonable", "logical", "rational", "comprehensible", "accept"],
"Relationships": ["relationship", "relate", "bond", "connection", "bond", "others", "appreciate", "appreciation", "recognize", "recognition", "acknowledge"],
"Emotions": ["emotions", "felt", "feel", "a feeling of", "sense", "sensation", "instinct", "sentiment", "gut feeling", "intense", "wave"],
"The course": ["first time", "never", "always", "course", "elective", "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments"],
}
# Category names in definition order; this order pairs the textboxes of the
# UI with the positional arguments received by convert().
CATEGORIES = CAT_TO_CODEWORDS.keys()
def retrieve_lines(filename):
    """Extract the text of a document and split it into sentence-like chunks.

    PDF files are read with pdfminer; ``.docx``/``.doc`` files are converted
    to plain text with pandoc (through ``pypandoc``). The extracted lines are
    stripped, joined with single spaces and finally split on ``"."``.

    Args:
        filename: Path of the document to read.

    Returns:
        A list of strings, one per ``"."``-separated chunk of the text.

    Raises:
        ValueError: If the file extension is not ``pdf``, ``docx`` or ``doc``.
    """
    # splitext only looks at the last path component, so dots in directory
    # names are ignored; lower() makes "REPORT.PDF" behave like "report.pdf".
    extension = os.path.splitext(filename)[1].lower().lstrip(".")

    if extension == "pdf":
        # A bare `import pdfminer` does not necessarily load the `high_level`
        # submodule, so import the function explicitly where it is needed.
        from pdfminer.high_level import extract_text

        raw_lines = extract_text(filename).split("\n")
    elif extension in ("docx", "doc"):
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, 'plain', outputfile=outfile)
            # pandoc always writes UTF-8, independent of the system locale.
            with open(outfile, "r", encoding="utf-8") as f:
                raw_lines = f.readlines()
    else:
        # Previously this fell through to an UnboundLocalError further down.
        raise ValueError(f"Unsupported file type {extension!r}: {filename}")

    text = " ".join(line.strip() for line in raw_lines)
    return text.split(".")
def match_code(lines, codewords):
    """Find the lines that contain any code word and highlight the hits.

    Code words are matched literally, as whole words (``\\b`` on both sides)
    and case-sensitively. Every hit is upper-cased in the returned text and
    runs of whitespace are collapsed to single spaces.

    Args:
        lines: Sequence of text chunks to search.
        codewords: Iterable of words or phrases to look for.

    Returns:
        A dict mapping the index of each matching line to its highlighted,
        whitespace-normalised text. Empty when nothing matches or when no
        (non-empty) code words are given.
    """
    # Escape so that characters such as "." or "(" in a code word are taken
    # literally instead of being interpreted as regex syntax.
    escaped = [re.escape(word) for word in codewords if word]
    if not escaped:
        # An empty alternation would match at every word boundary and thus
        # report every line as a hit.
        return {}

    keywords_to_match = re.compile(r"\b(?:" + "|".join(escaped) + r")\b")

    match_dict = {}
    for i, line in enumerate(lines):
        # subn rewrites all hits in one pass, so a hit whose upper-cased form
        # has a different length (e.g. "ß" -> "SS") cannot shift later hits.
        highlighted, hits = keywords_to_match.subn(
            lambda m: m.group(0).upper(), line
        )
        if hits:
            match_dict[i] = " ".join(highlighted.split())
    return match_dict
def main(filename, codewords_mapping):
    """Scan one document and append its matches to the per-category files.

    For every category whose code words occur in the document, a section
    listing the matching lines is appended to
    ``RESULTS_FOLDER/<category>.result.txt`` (spaces in the category name are
    replaced by underscores). A category header is written first when the
    result file does not exist yet.

    Args:
        filename: Path of the document to scan.
        codewords_mapping: Dict mapping a category label to its code words.

    Raises:
        Whatever ``retrieve_lines`` raises for unreadable or unsupported files.
    """
    lines = retrieve_lines(filename)
    # Do not rely on the caller having created the output folder.
    os.makedirs(RESULTS_FOLDER, exist_ok=True)

    for label, codewords in codewords_mapping.items():
        match = match_code(lines, codewords)
        if not match:
            continue

        result_file = os.path.join(
            RESULTS_FOLDER, "_".join(label.split()) + ".result.txt"
        )

        parts = []
        if not os.path.exists(result_file):
            # First hit for this category: start the file with its header.
            parts.append(f"# Code: {label}\n" + 25 * "=" + "\n\n")
        parts.append(f"## Source: {filename}\n" + 25 * "-" + "\n")
        parts.append("\n".join(f"-{v}" for v in match.values()))
        parts.append("\n" + 25 * "-" + "\n\n")

        # Explicit encoding: the documents may contain non-ASCII text and the
        # locale default is not UTF-8 on every platform.
        with open(result_file, "a", encoding="utf-8") as f:
            f.write("".join(parts))
def convert(*keywords):
    """Gradio callback: code every document found below the working directory.

    Each positional argument is the content of one textbox, in the order of
    ``CATEGORIES``. A textbox value is read as a comma-separated list of code
    words; an empty textbox falls back to the category's defaults from
    ``CAT_TO_CODEWORDS`` (assumption: blank means "use the suggested words").

    The results folder is emptied once, then every file in every first-level
    sub-folder of ``./`` is passed to ``main``. Files that cannot be processed
    are reported on stdout and skipped.

    Args:
        *keywords: One string (or list of strings) per category.

    Returns:
        A status message with the number of files found.
    """
    # NOTE: downloading the documents from the hub is currently disabled:
    # cached_folder = snapshot_download("claudiag/atlas", token=os.environ.get("HF_TOKEN"))
    codewords_mapping = {}
    for category, raw in zip(CATEGORIES, keywords):
        if isinstance(raw, str):
            # Textboxes deliver plain strings, not lists.
            words = [word.strip() for word in raw.split(",")]
        else:
            words = list(raw or [])
        words = [word for word in words if word]
        codewords_mapping[category] = words or list(CAT_TO_CODEWORDS[category])

    # Reset the output once per run; doing it per input folder would throw
    # away the results of all previously processed folders.
    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    os.makedirs(RESULTS_FOLDER)

    results_dir = os.path.normpath(RESULTS_FOLDER)
    num_files = 0
    for folder in tqdm.tqdm(glob.glob("./*")):
        if os.path.normpath(folder) == results_dir:
            # Never feed our own output back in as input.
            continue
        all_files = glob.glob(os.path.join(folder, "*"))
        num_files += len(all_files)
        for filename in tqdm.tqdm(all_files):
            try:
                main(filename, codewords_mapping)
            except Exception as e:
                # Best effort: one broken document must not abort the run.
                print(f"{filename} not working because \n {e}")
    return f"Retrieved from {num_files}"
# One textbox per category, in CATEGORIES order, so that the values arrive in
# convert() in the same order. The placeholder must be a string, so the
# default code words are shown as a comma-separated list.
inputs = [
    gr.Textbox(
        label=f"Enter your keywords for {k}",
        max_lines=2,
        placeholder=", ".join(CAT_TO_CODEWORDS[k]),
    )
    for k in CATEGORIES
]
# Text-in / text-out UI around convert(); launched as soon as the script runs.
iface = gr.Interface(fn=convert, inputs=inputs, outputs="text")
iface.launch()
|