"""Gradio app that tags sentences of course documents with qualitative codes.

The documents of a Hugging Face dataset snapshot are downloaded at start-up.
For every category the user supplies a comma-separated keyword list; each
sentence containing one of the keywords is appended (keyword upper-cased) to
a per-category result file, and the result folder is uploaded to the Hub.
"""
import glob
import os
import re
import shutil
import tempfile
import time

import gradio as gr
import pypandoc
import tqdm
from huggingface_hub import HfApi
from huggingface_hub import snapshot_download
from pdfminer.high_level import extract_text

HF_TOKEN = os.environ.get("HF_TOKEN")
api = HfApi()

#from docx import Document
#document = Document()
#document.add_heading('Labels for ', level=1)

RESULTS_FOLDER = "./results"
# NOTE: runs at import time and needs network access to the Hub.
DOC_FOLDER = snapshot_download("claudiag/atlas", token=HF_TOKEN, repo_type="dataset")

# Default keywords per category; they pre-fill the textboxes of the UI.
CAT_TO_CODEWORDS = {
    "Prejudices": ["prejudice", "judge", "preconceive", "stigma", "assumption", "assume", "misunderstanding", "unexamined", "distorted", "clear", "compar"],
    "Self-knowledge": ["self-knowledge", "self-awareness", "introspection", "examined", "myself", "realization", "belief"],
    "Similarities": ["similarity", "same", "similar", "equal", "related", "together"],
    "Diversity": ["diverse", "different", "diverse", "particular", "range", "multiplicity"],
    "Business school": ["ESADE", "competitive", "business school", "education", "study", "university", "student", "consulting", "professional", "pressure", "performance", "institution"],
    "Courage": ["courage", "brave", "dare", "step", "determine"],
    "Change": ["change", "finally", "at last", "decided", "chose", "concluded", "want to", "swap", "different", "not the same", "replace", "convert", "trade", "future", "decision"],
    "Coherence": ["coherent", "align", "incoherent", "consistent"],
    "Voicing": ["speak", "express", "voice", "talk", "say", "open up", "articulate", "communicate", "convey", "reveal", "show", "verbalize", "phrase", "word"],
    "Listening": ["listen", "pay attention", "quiet", "silence", "process", "hear", "attend"],
    "Understanding": ["learn", "understand", "realize", "see", "believe", "question", "critical", "thought", "reasonable", "logical", "rational", "comprehensible", "accept"],
    "Relationships": ["relationship", "relate", "bond", "connection", "bond", "others", "appreciate", "appreciation", "recognize", "recognition", "acknowledge"],
    "Emotions": ["emotions", "felt", "feel", "a feeling of", "sense", "sensation", "instinct", "sentiment", "gut feeling", "intense", "wave"],
    "The course": ["first time", "never", "always", "course", "elective", "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments"],
}

CATEGORIES = CAT_TO_CODEWORDS.keys()


def _parse_keywords(raw):
    """Normalise one keyword specification into a list of clean keywords.

    Args:
        raw: Either a comma-separated string (what a Gradio textbox delivers),
            an iterable of keyword strings, or ``None``.

    Returns:
        The keywords with surrounding whitespace removed, in their original
        order; empty entries are dropped.
    """
    if raw is None:
        return []
    if isinstance(raw, str):
        raw = raw.split(",")
    return [word.strip() for word in raw if word and word.strip()]


def retrieve_lines(filename: str) -> list:
    """Extract the text of a document and split it into sentences.

    Args:
        filename: Path of a ``.pdf``, ``.docx`` or ``.doc`` file (the
            extension is compared case-insensitively).

    Returns:
        The pieces obtained by joining all stripped text lines with a single
        space and splitting the result on ``"."``.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    extension = filename.split(".")[-1].lower()

    if extension == "pdf":
        raw_lines = extract_text(filename).split("\n")
    elif extension in ("docx", "doc"):
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, 'plain', outputfile=outfile)
            # pandoc always writes UTF-8, independent of the local locale.
            with open(outfile, "r", encoding="utf-8") as f:
                raw_lines = f.readlines()
    else:
        # Previously this fell through and failed with an UnboundLocalError.
        raise ValueError(f"Unsupported file type '.{extension}' for {filename}")

    text = " ".join(line.strip() for line in raw_lines)
    return text.split(".")


def match_code(lines, codewords) -> dict:
    """Find the sentences that contain at least one of the keywords.

    Matching is case-sensitive and anchored on word boundaries at both ends
    of the keyword.

    Args:
        lines: Sequence of sentences to search.
        codewords: Keywords, either as an iterable of strings or as a single
            comma-separated string.

    Returns:
        A dict mapping the index of each matching sentence to that sentence
        with every matched keyword upper-cased and all runs of whitespace
        collapsed to single spaces. Empty when there are no keywords.
    """
    codewords = _parse_keywords(codewords)
    if not codewords:
        # An empty alternation would match at every word boundary.
        return {}

    # Escape the keywords so that user input is matched literally.
    alternatives = "|".join(re.escape(word) for word in codewords)
    keywords_to_match = re.compile(fr'\b(?:{alternatives})\b')

    match_dict = {}
    for i, line in enumerate(lines):
        highlighted, num_matches = keywords_to_match.subn(
            lambda m: m.group().upper(), line
        )
        if num_matches:
            match_dict[i] = " ".join(highlighted.split())

    return match_dict


def main(filename: str, codewords_mapping: dict) -> list:
    """Append the keyword matches of one document to the result files.

    For every category with at least one match, a section naming the source
    document and listing the matching sentences is appended to
    ``<RESULTS_FOLDER>/<Category_Name>.result.txt``. A category header is
    written first when that result file does not exist yet.

    Args:
        filename: Path of the document to analyse.
        codewords_mapping: Maps each category label to its keywords.

    Returns:
        The paths of the result files that were written for this document.

    Raises:
        ValueError: If the document type is not supported by
            ``retrieve_lines``.
    """
    lines = retrieve_lines(filename)
    # Identify the source by its parent folder and file name only.
    source_name = "/".join(filename.split("/")[-2:])
    rule = 25 * "-"

    files = []
    for label, codewords in codewords_mapping.items():
        match = match_code(lines, codewords)
        if not match:
            continue

        result_file = os.path.join(
            RESULTS_FOLDER, f"{'_'.join(label.split())}.result.txt"
        )

        parts = []
        if not os.path.exists(result_file):
            # First match for this category: start the file with a header.
            parts.append(f"# Code: {label}\n{25 * '='}\n\n")
        parts.append(f"## Source: {source_name}\n{rule}\n")
        parts.append("\n".join(f"-{sentence}" for sentence in match.values()))
        parts.append(f"\n{rule}\n\n")

        with open(result_file, "a", encoding="utf-8") as f:
            f.write("".join(parts))

        files.append(result_file)

    return files


def convert(*keywords) -> str:
    """Run the keyword search over all documents and upload the results.

    Args:
        *keywords: One value per category, in the order of ``CATEGORIES``.
            Each value is the comma-separated content of the corresponding
            textbox (an iterable of keywords is accepted as well).

    Returns:
        A status message with the number of files that were visited.
    """
    # The textboxes deliver comma-joined strings; turn them into lists.
    codewords_mapping = {
        category: _parse_keywords(raw)
        for category, raw in zip(CATEGORIES, keywords)
    }

    # Start from an empty result folder on every run.
    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    os.makedirs(RESULTS_FOLDER, exist_ok=True)

    num_files = 0
    folders = glob.glob(os.path.join(DOC_FOLDER, "*"))
    for folder in tqdm.tqdm(folders):
        all_files = glob.glob(f"{folder}/*")
        num_files += len(all_files)
        for filename in tqdm.tqdm(all_files):
            # Best effort: one unreadable document must not abort the run.
            try:
                main(filename, codewords_mapping)
            except Exception as e:
                print(f"{filename} not working because \n {e}")

    api.upload_folder(
        repo_id="patrickvonplaten/atlas",
        folder_path=RESULTS_FOLDER,
        path_in_repo=f"results_{time.time()}",
        repo_type="dataset",
        token=HF_TOKEN,
    )

    return f"Done. Processed {num_files} files."


# One textbox per category, pre-filled with the default keywords.
inputs = [
    gr.Textbox(
        label=f"Enter your keywords for {k}",
        max_lines=2,
        placeholder=",".join(CAT_TO_CODEWORDS[k]),
        value=",".join(CAT_TO_CODEWORDS[k]),
    )
    for k in CATEGORIES
]

iface = gr.Interface(
    fn=convert,
    inputs=inputs,
    outputs="text")
iface.launch()