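"""Gradio app for lightweight qualitative coding.

Scans a corpus of PDF/Word documents for category-specific keywords, highlights the
matches, and uploads per-category result files to a Hugging Face dataset repo.
"""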
import gradio as gr
import pypandoc
import glob
import shutil
import os
import tqdm
from huggingface_hub import snapshot_download
from huggingface_hub import HfApi
import tempfile
import re
from pdfminer.high_level import extract_text
import time

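# Access token for the Hugging Face Hub, read from the environment.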
HF_TOKEN = os.environ.get("HF_TOKEN")

api = HfApi()


#from docx import Document
#document = Document()
#document.add_heading('Labels for ', level=1)
RESULTS_FOLDER = "./results"

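# Download the corpus of documents to analyze from the source dataset repo.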
DOC_FOLDER = snapshot_download("claudiag/atlas", token=HF_TOKEN, repo_type="dataset")

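# Each qualitative code (category) maps to the default keywords that are searched for in the documents.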
CAT_TO_CODEWORDS = {
    "Prejudices": ["prejudice", "judge", "preconceive", "stigma", "assumption", "assume", "misunderstanding", "unexamined", "distorted", "clear", "compar"],
    "Self-knowledge": ["self-knowledge", "self-awareness", "introspection", "examined", "myself", "realization", "belief"],
    "Similarities": ["similarity", "same", "similar", "equal", "related", "together"],
    "Diversity": ["diverse", "different", "diverse", "particular", "range", "multiplicity"],
    "Business school": ["ESADE", "competitive", "business school", "education", "study", "university", "student", "consulting", "professional", "pressure", "performance", "institution"],
    "Courage": ["courage", "brave", "dare", "step", "determine"],
    "Change": ["change", "finally", "at last", "decided", "chose", "concluded", "want to", "swap", "different", "not the same", "replace", "convert", "trade", "future", "decision"],
    "Coherence": ["coherent", "align", "incoherent", "consistent"],
    "Voicing": ["speak", "express", "voice", "talk", "say", "open up", "articulate", "communicate", "convey", "reveal", "show", "verbalize", "phrase", "word"],
    "Listening": ["listen", "pay attention", "quiet", "silence", "process", "hear", "attend"],
    "Understanding": ["learn", "understand", "realize", "see", "believe", "question", "critical", "thought", "reasonable", "logical", "rational", "comprehensible", "accept"],
    "Relationships": ["relationship", "relate", "bond", "connection", "bond", "others", "appreciate", "appreciation", "recognize", "recognition", "acknowledge"],
    "Emotions": ["emotions", "felt", "feel", "a feeling of", "sense", "sensation", "instinct", "sentiment", "gut feeling", "intense", "wave"],
    "The course": ["first time", "never", "always", "course", "elective", "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments"],
}

CATEGORIES = CAT_TO_CODEWORDS.keys()

def retrieve_lines(filename):
    """Extract the text of a .pdf, .docx, or .doc file and split it into sentences."""
    extension = filename.split(".")[-1].lower()

    if extension == "pdf":
        text = extract_text(filename)
        lines = text.split("\n")
    elif extension in ["docx", "doc"]:
        # Convert the Word document to plain text with pandoc, then read it back in.
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, 'plain', outputfile=outfile)
            with open(outfile, "r") as f:
                lines = f.readlines()

        lines = [l.strip() for l in lines]
    else:
        raise ValueError(f"Unsupported file type: {filename}")

    # Re-join everything and split on full stops so each entry is roughly one sentence.
    lines = " ".join(lines)
    lines = lines.split(".")

    return lines

def match_code(lines, codewords):
    """Return {line_index: line} for every line containing a codeword, with matches upper-cased."""
    match_dict = {}
    keywords_to_match = re.compile(fr'\b(?:{"|".join(codewords)})\b')
    for i, line in enumerate(lines):
        matches = list(keywords_to_match.finditer(line))

        if len(matches) > 0:
            # Upper-case every matched keyword so it stands out in the result file.
            for m in matches:
                span = m.span()
                line = line[:span[0]] + line[span[0]:span[1]].upper() + line[span[1]:]

            # Collapse repeated whitespace before storing the line.
            match_dict[i] = " ".join(line.split())

    return match_dict

def main(filename, codewords_mapping):
    """Match every category's codewords against one document and append the hits to per-category result files."""
    lines = retrieve_lines(filename)
    files = []

    for label, codewords in codewords_mapping.items():
        match = match_code(lines, codewords)

        if len(match) > 0:
            result_file = ".".join(['_'.join(label.split()), "result", "txt"])
            result_file = os.path.join(RESULTS_FOLDER, result_file)

            out = ""
            # Only write the category header the first time this result file is created.
            if not os.path.exists(result_file):
                out += f"# Code: {label}\n"
                out += 25 * "="
                out += "\n\n"

            out += f"## Source: {'/'.join(filename.split('/')[-2:])}\n"
            out += 25 * "-"
            out += "\n"
            out += "\n".join([f'-{v}' for v in match.values()])
            out += "\n"
            out += 25 * "-"
            out += "\n\n"

            with open(result_file, "a") as f:
                f.write(out)

            # Only report files that actually received new matches.
            files.append(result_file)

    return files


def convert(*keywords):
    """Gradio callback: run the keyword matching over every document and upload the results to the Hub."""
    # Each textbox returns a comma-separated string, so split it back into a list of codewords.
    codewords_mapping = {k: [w.strip() for w in v.split(",") if w.strip()] for k, v in zip(CATEGORIES, keywords)}

    num_files = 0

    # Start from a clean results folder on every run.
    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    os.makedirs(RESULTS_FOLDER)

    result_files = []
    folders = glob.glob(os.path.join(DOC_FOLDER, "*"))

    for folder in tqdm.tqdm(folders):
        all_files = glob.glob(f"{folder}/*")
        num_files += len(all_files)

        for filename in tqdm.tqdm(all_files):
            try:
                result_files += main(filename, codewords_mapping)
            except Exception as e:
                print(f"{filename} not working because of:\n {e}")

    result_files = list(set(result_files))

    # Upload the result files to a timestamped folder in the results dataset repo.
    api.upload_folder(
        repo_id="patrickvonplaten/atlas",
        folder_path=RESULTS_FOLDER,
        path_in_repo=f"results_{time.time()}",
        repo_type="dataset",
        token=HF_TOKEN,
    )

    return f"Done. Processed {num_files} files and wrote {len(result_files)} result files."


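# One textbox per category, pre-filled with the default keywords so they can be edited before a run.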
inputs = [gr.Textbox(label=f"Enter your keywords for {k}", max_lines=2, placeholder=", ".join(CAT_TO_CODEWORDS[k]), value=",".join(CAT_TO_CODEWORDS[k])) for k in CATEGORIES]

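# Minimal Gradio UI: keyword textboxes in, a status message out.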
iface = gr.Interface(
        fn=convert, inputs=inputs, outputs="text")
iface.launch()