Spaces:
Runtime error
Runtime error
File size: 5,668 Bytes
9fa6437 7d2f336 b5dcb77 7434749 7d2f336 7434749 14aa4ce b962a46 d71a81f 7d2f336 9fa6437 d71a81f a87bd77 7d2f336 7434749 7d2f336 9fec05e 7d2f336 7434749 7d2f336 9fec05e 7d2f336 9fec05e 7d2f336 9fec05e 7434749 7d2f336 14aa4ce 7434749 14aa4ce 7d2f336 1fb0efd 7434749 3b3304f 1fb0efd d71a81f 35712e5 7d2f336 7bc15eb 9fa6437 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import gradio as gr
import pypandoc
import glob
import shutil
import os
import tqdm
from huggingface_hub import snapshot_download
from huggingface_hub import HfApi
import tempfile
import re
from pdfminer.high_level import extract_text
import time
# Hub credentials / client used for downloading the source dataset and
# uploading results.  HF_TOKEN may be None when unset; the Hub calls below
# would then only work on public repos.
HF_TOKEN = os.environ.get("HF_TOKEN")
api = HfApi()
# Local scratch directory where per-category result files are written.
RESULTS_FOLDER = "./results"
# Snapshot of the document dataset, downloaded at import time (network I/O).
DOC_FOLDER = snapshot_download("claudiag/atlas", token=HF_TOKEN, repo_type="dataset")
# Mapping from qualitative-coding category -> list of codewords to search for.
# These double as default/placeholder values for the UI textboxes and as the
# vocabulary highlighted in the result files.
# Fix: removed accidental duplicates ("diverse" in Diversity, "bond" in
# Relationships) that bloated the regex alternation without changing matches.
CAT_TO_CODEWORDS = {
    "Prejudices": ["prejudice", "judge", "preconceive", "stigma", "assumption", "assume", "misunderstanding", "unexamined", "distorted", "clear", "compar"],
    "Self-knowledge": ["self-knowledge", "self-awareness", "introspection", "examined", "myself", "realization", "belief"],
    "Similarities": ["similarity", "same", "similar", "equal", "related", "together"],
    "Diversity": ["diverse", "different", "particular", "range", "multiplicity"],
    "Business school": ["ESADE", "competitive", "business school", "education", "study", "university", "student", "consulting", "professional", "pressure", "performance", "institution"],
    "Courage": ["courage", "brave", "dare", "step", "determine"],
    "Change": ["change", "finally", "at last", "decided", "chose", "concluded", "want to", "swap", "different", "not the same", "replace", "convert", "trade", "future", "decision"],
    "Coherence": ["coherent", "align", "incoherent", "consistent"],
    "Voicing": ["speak", "express", "voice", "talk", "say", "open up", "articulate", "communicate", "convey", "reveal", "show", "verbalize", "phrase", "word"],
    "Listening": ["listen", "pay attention", "quiet", "silence", "process", "hear", "attend"],
    "Understanding": ["learn", "understand", "realize", "see", "believe", "question", "critical", "thought", "reasonable", "logical", "rational", "comprehensible", "accept"],
    "Relationships": ["relationship", "relate", "bond", "connection", "others", "appreciate", "appreciation", "recognize", "recognition", "acknowledge"],
    "Emotions": ["emotions", "felt", "feel", "a feeling of", "sense", "sensation", "instinct", "sentiment", "gut feeling", "intense", "wave"],
    "The course": ["first time", "never", "always", "course", "elective", "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments"],
}
# Fixed category order (dicts preserve insertion order); used to pair the UI
# textbox values back up with their categories.  Materialized as a list so it
# is indexable and not a live view of the dict.
CATEGORIES = list(CAT_TO_CODEWORDS.keys())
def retrieve_lines(filename):
    """Extract the text of *filename* and return it split into rough sentences.

    Supports ``.pdf`` (pdfminer), ``.docx``/``.doc`` (pandoc) and, as a
    generalization, plain ``.txt`` files.  The whole document is flattened
    into one whitespace-joined string and then split on ``'.'``, so each
    returned "line" is approximately one sentence.

    Raises:
        ValueError: for any other file extension.  (The original code left
        ``lines`` unbound in that case and crashed with UnboundLocalError.)
    """
    extension = filename.split(".")[-1].lower()  # case-insensitive, e.g. ".PDF"
    if extension == "pdf":
        raw_lines = extract_text(filename).split("\n")
    elif extension in ("docx", "doc"):
        # Convert to plain text via pandoc in a throwaway directory.
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, 'plain', outputfile=outfile)
            with open(outfile, "r") as f:
                raw_lines = f.readlines()
    elif extension == "txt":
        # Plain-text documents need no conversion.
        with open(filename, "r") as f:
            raw_lines = f.readlines()
    else:
        raise ValueError(f"Unsupported file extension: {extension}")
    # Flatten to one string, then split on '.' into approximate sentences.
    stripped = [l.strip() for l in raw_lines]
    return " ".join(stripped).split(".")
def match_code(lines, codewords):
    """Return ``{line_index: highlighted_line}`` for lines containing a codeword.

    Each matched codeword is uppercased in place in the returned line, and
    the line's whitespace is normalized to single spaces.

    Args:
        lines: list of strings (rough sentences) to search.
        codewords: list of literal keywords/phrases (user-supplied).
    """
    match_dict = {}
    # Drop empty codewords: an empty alternative would make the regex match
    # the empty string at every word boundary.
    words = [w for w in codewords if w]
    if not words:
        return match_dict
    # re.escape each word: codewords come from user textboxes, so regex
    # metacharacters must be treated literally (robustness fix).
    pattern = re.compile(r'\b(?:' + "|".join(re.escape(w) for w in words) + r')\b')
    for i, line in enumerate(lines):
        matches = list(pattern.finditer(line))
        if not matches:
            continue
        for m in matches:
            start, end = m.span()
            # Uppercasing ASCII preserves length, so the original spans stay
            # valid as the line is rewritten in place.
            line = line[:start] + line[start:end].upper() + line[end:]
        match_dict[i] = " ".join(line.strip().split())
    return match_dict
def main(filename, codewords_mapping):
    """Scan one document for every category's codewords.

    For each category with at least one match, append a formatted section to
    that category's result file under RESULTS_FOLDER (writing the category
    header only on the file's first creation).  Returns the list of result
    files touched for this document (may contain a file more than once if
    callers accumulate across documents).
    """
    lines = retrieve_lines(filename)
    written = []
    for label, codewords in codewords_mapping.items():
        matched = match_code(lines, codewords)
        if not matched:
            continue
        result_file = os.path.join(
            RESULTS_FOLDER, ".".join(["_".join(label.split()), "result", "txt"])
        )
        chunks = []
        if not os.path.exists(result_file):
            # First section for this category: emit the code header once.
            chunks.append(f"# Code: {label}\n")
            chunks.append(25 * "=")
            chunks.append("\n\n")
        chunks.append(f"## Source: {'/'.join(filename.split('/')[-2:])}\n")
        chunks.append(25 * "-")
        chunks.append("\n")
        chunks.append("\n".join(f"-{v}" for v in matched.values()))
        chunks.append("\n")
        chunks.append(25 * "-")
        chunks.append("\n\n")
        with open(result_file, "a") as f:
            f.write("".join(chunks))
        written.append(result_file)
    return written
def convert(*keywords):
    """Gradio callback: code every document in the dataset with the given keywords.

    Args:
        *keywords: one comma-separated keyword string per category, in
            CATEGORIES order (one per UI textbox).

    Side effects: recreates RESULTS_FOLDER, writes per-category result files,
    and uploads the folder to the Hub.  Returns a status message string.
    """
    # BUG FIX: each textbox delivers ONE comma-separated string.  The original
    # passed it through unsplit, so match_code's "|".join(codewords) iterated
    # over CHARACTERS and built a nonsense single-character regex.  Split on
    # commas and strip whitespace instead.
    codewords_mapping = {
        label: [w.strip() for w in text.split(",") if w.strip()]
        for label, text in zip(CATEGORIES, keywords)
    }
    num_files = 0
    # Start from a clean slate so stale results never leak into the upload.
    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    os.makedirs(RESULTS_FOLDER)
    result_files = []
    folders = glob.glob(os.path.join(DOC_FOLDER, "*"))
    for folder in tqdm.tqdm(folders):
        all_files = glob.glob(f"{folder}/*")
        num_files += len(all_files)
        for filename in tqdm.tqdm(all_files):
            try:
                result_files += main(filename, codewords_mapping)
            except Exception as e:
                # Best-effort: skip unreadable/unsupported documents, but say
                # which one failed (the original printed a literal "(unknown)").
                print(f"{filename} not working because \n {e}")
    result_files = list(set(result_files))
    api.upload_folder(
        repo_id="patrickvonplaten/atlas",
        folder_path=RESULTS_FOLDER,
        path_in_repo=f"results_{time.time()}",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return f"Done. Processed {num_files} files."
# One textbox per category, pre-filled with that category's default codewords.
# NOTE(review): each textbox yields a single comma-separated STRING, so
# convert() receives strings, not lists — verify convert() splits them.
inputs = [gr.Textbox(label=f"Enter your keywords for {k}", max_lines=2, placeholder=CAT_TO_CODEWORDS[k], value=",".join(CAT_TO_CODEWORDS[k])) for k in CATEGORIES]
# Build and launch the Gradio app at import time (blocks until shut down).
iface = gr.Interface(
    fn=convert, inputs=inputs, outputs="text")
iface.launch()
|