Spaces:
Runtime error
Runtime error
File size: 6,911 Bytes
9fa6437 7d2f336 b962a46 7d2f336 9fa6437 7d2f336 9fa6437 7d2f336 9fa6437 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import gradio as gr
import pypandoc
import glob
import shutil
import os
import tqdm
import tempfile
import re
# Log the installed pdfminer version at startup. The package itself must be
# imported here: `from pdfminer.high_level import ...` does not bind the
# name `pdfminer`.
import pdfminer

print("pdfminer", pdfminer.__version__)
from pdfminer.high_level import extract_text
#from docx import Document
#document = Document()
#document.add_heading('Labels for ', level=1)
# Folder that receives one "<label>.result.txt" file per matched category.
RESULTS_FOLDER = "./results"

# Default codewords searched for each category (category label -> keywords).
# Defined once; repeated entries inside a list were removed because they add
# nothing to the regex alternation built from these words.
CAT_TO_CODEWORDS = {
    "Prejudices": [
        "prejudice", "judge", "preconceive", "stigma", "assumption", "assume",
        "misunderstanding", "unexamined", "distorted", "clear", "compar",
    ],
    "Self-knowledge": [
        "self-knowledge", "self-awareness", "introspection", "examined",
        "myself", "realization", "belief",
    ],
    "Similarities": [
        "similarity", "same", "similar", "equal", "related", "together",
    ],
    "Diversity": [
        "diverse", "different", "particular", "range", "multiplicity",
    ],
    "Business school": [
        "ESADE", "competitive", "business school", "education", "study",
        "university", "student", "consulting", "professional", "pressure",
        "performance", "institution",
    ],
    "Courage": ["courage", "brave", "dare", "step", "determine"],
    "Change": [
        "change", "finally", "at last", "decided", "chose", "concluded",
        "want to", "swap", "different", "not the same", "replace", "convert",
        "trade", "future", "decision",
    ],
    "Coherence": ["coherent", "align", "incoherent", "consistent"],
    "Voicing": [
        "speak", "express", "voice", "talk", "say", "open up", "articulate",
        "communicate", "convey", "reveal", "show", "verbalize", "phrase",
        "word",
    ],
    "Listening": [
        "listen", "pay attention", "quiet", "silence", "process", "hear",
        "attend",
    ],
    "Understanding": [
        "learn", "understand", "realize", "see", "believe", "question",
        "critical", "thought", "reasonable", "logical", "rational",
        "comprehensible", "accept",
    ],
    "Relationships": [
        "relationship", "relate", "bond", "connection", "others",
        "appreciate", "appreciation", "recognize", "recognition",
        "acknowledge",
    ],
    "Emotions": [
        "emotions", "felt", "feel", "a feeling of", "sense", "sensation",
        "instinct", "sentiment", "gut feeling", "intense", "wave",
    ],
    "The course": [
        "first time", "never", "always", "course", "elective",
        "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments",
    ],
}

# Category labels, in definition order (a live view of the dict's keys).
CATEGORIES = CAT_TO_CODEWORDS.keys()
def retrieve_lines(filename):
    """Read a PDF or Word document and return its text split on periods.

    PDF files are read with pdfminer's ``extract_text``; ``.docx``/``.doc``
    files are converted to plain text with pypandoc via a temporary file.
    The resulting lines are stripped, joined with single spaces and then
    split on ``"."``.

    Args:
        filename: Path of the document to read. The extension is compared
            case-insensitively.

    Returns:
        A list of strings, one per period-delimited chunk of the text.

    Raises:
        ValueError: If the file extension is not pdf, docx or doc.
    """
    # splitext only looks at the last path component, so dots in directory
    # names cannot be mistaken for the file extension.
    extension = os.path.splitext(filename)[1].lower().lstrip(".")
    if extension == "pdf":
        raw_lines = extract_text(filename).split("\n")
    elif extension in ("docx", "doc"):
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, 'plain', outputfile=outfile)
            # pandoc writes UTF-8; do not rely on the platform default.
            with open(outfile, "r", encoding="utf-8") as f:
                raw_lines = f.readlines()
    else:
        # Previously this fell through and failed on an unbound local.
        raise ValueError(
            f"Unsupported file type {extension!r} for {filename!r}; "
            "expected pdf, docx or doc"
        )
    text = " ".join(line.strip() for line in raw_lines)
    return text.split(".")
def match_code(lines, codewords):
    """Find the lines that contain any codeword and highlight the hits.

    Codewords are matched literally, case-sensitively and only at word
    boundaries. Every hit is upper-cased and the line's whitespace is
    collapsed to single spaces.

    Args:
        lines: Sequence of text chunks to search.
        codewords: Iterable of keyword strings; empty strings are ignored.

    Returns:
        A dict mapping the index of each matching line to its highlighted,
        whitespace-normalized text. Empty when nothing matches or when no
        usable codeword is given.
    """
    # Escape each keyword so regex metacharacters in it are taken literally
    # instead of changing (or breaking) the pattern.
    alternatives = [re.escape(word) for word in codewords if word]
    if not alternatives:
        # An empty alternation would match at every word boundary and flag
        # practically every line.
        return {}
    keywords_to_match = re.compile(r"\b(?:" + "|".join(alternatives) + r")\b")

    match_dict = {}
    for i, line in enumerate(lines):
        # subn rewrites all hits in one pass, so the result stays correct
        # even if upper-casing changes the length of a match.
        highlighted, hits = keywords_to_match.subn(
            lambda m: m.group(0).upper(), line
        )
        if hits:
            match_dict[i] = " ".join(highlighted.split())
    return match_dict
def main(filename, codewords_mapping):
    """Search one document for every category's codewords and log the hits.

    For each category with at least one matching line, a section is appended
    to ``<RESULTS_FOLDER>/<label with underscores>.result.txt``. The first
    write to a file also emits a ``# Code: <label>`` header.

    Args:
        filename: Path of the document to analyse (passed to
            ``retrieve_lines``).
        codewords_mapping: Mapping of category label to its list of
            codewords.

    Returns:
        None. Results are written to disk.
    """
    lines = retrieve_lines(filename)
    for label, codewords in codewords_mapping.items():
        match = match_code(lines, codewords)
        if not match:
            continue

        result_name = ".".join(["_".join(label.split()), "result", "txt"])
        result_file = os.path.join(RESULTS_FOLDER, result_name)

        parts = []
        if not os.path.exists(result_file):
            # First hit for this category: start the file with its header.
            parts.append(f"# Code: {label}\n" + 25 * "=" + "\n\n")
        parts.append(f"## Source: {filename}\n" + 25 * "-" + "\n")
        parts.append("\n".join(f"-{v}" for v in match.values()) + "\n")
        parts.append(25 * "-" + "\n\n")

        # Create the folder on demand so a direct call does not fail, and
        # write UTF-8 explicitly because extracted text is often non-ASCII.
        os.makedirs(RESULTS_FOLDER, exist_ok=True)
        with open(result_file, "a", encoding="utf-8") as f:
            f.write("".join(parts))
def _parse_keywords(raw, default):
    """Turn one UI value into a list of codewords, falling back to *default*."""
    if raw is None:
        return list(default)
    if isinstance(raw, str):
        # A textbox delivers one string; accept comma-separated keywords.
        words = [word.strip() for word in raw.split(",")]
    else:
        words = [str(word).strip() for word in raw]
    words = [word for word in words if word]
    return words or list(default)


def convert(*keywords):
    """Run the codeword search over every file in every sub-folder of cwd.

    Args:
        *keywords: One value per category, in ``CATEGORIES`` order. Each is
            either a comma-separated string or a list of codewords; an empty
            value falls back to that category's defaults in
            ``CAT_TO_CODEWORDS``.

    Returns:
        A status string ``"Retrieved from <number of files found>"``. Files
        that fail to process are still counted; the failure is printed.
    """
    codewords_mapping = {
        label: _parse_keywords(raw, CAT_TO_CODEWORDS[label])
        for label, raw in zip(CATEGORIES, keywords)
    }
    print(codewords_mapping)

    # Reset the results folder once per run. Doing it inside the folder loop
    # would discard the results of every folder but the last.
    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    os.makedirs(RESULTS_FOLDER)
    results_dir = os.path.normpath(RESULTS_FOLDER)

    num_files = 0
    for folder in tqdm.tqdm(glob.glob("./*")):
        # Only directories hold input documents; never scan our own output.
        if not os.path.isdir(folder) or os.path.normpath(folder) == results_dir:
            continue
        all_files = glob.glob(os.path.join(folder, "*"))
        num_files += len(all_files)
        for filename in tqdm.tqdm(all_files):
            try:
                main(filename, codewords_mapping)
            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"{filename} not working because \n {e}")
    return f"Retrieved from {num_files}"
# One textbox per category; the default codewords are shown as a hint.
# `placeholder` (singular) is the Textbox parameter and expects a string.
inputs = [
    gr.Textbox(
        label=f"Enter your keywords for {k}",
        max_lines=2,
        placeholder=", ".join(CAT_TO_CODEWORDS[k]),
    )
    for k in CATEGORIES
]
# The handler defined in this file is `convert`; `greet` does not exist.
iface = gr.Interface(fn=convert, inputs=inputs, outputs="text")
iface.launch()
|