File size: 3,182 Bytes
ad7e448
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import pandas as pd
from transformers import CLIPTokenizer
import tkinter as tk
from tkinter import filedialog, scrolledtext
import threading

# Initialize the CLIPTokenizer once at import time from the
# "openai/clip-vit-base-patch32" pretrained vocabulary; rcs() reuses it.
s_token = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

def rcs(text):
    """Return how many token ids the shared CLIP tokenizer yields for *text*.

    ``None`` is passed through unchanged so callers can keep missing rows
    as missing values.
    """
    return None if text is None else len(s_token.encode(text))

def process_files(dflist, text_box, select_button, process_button):
    """Add a token-count column to each parquet file and save the result.

    For every path in ``dflist`` the file is read with pandas, each value of
    the ``general`` column is passed to ``rcs`` and the results are stored in
    a new ``tokens`` column (``None`` where the text is ``None``). The frame
    is then written to ``processed/<original file name>``. Progress messages
    are appended to ``text_box``.

    Args:
        dflist: Parquet file paths to process. When it is a mutable list it
            is emptied in place after a successful run, so the same files
            are not processed again on the next run.
        text_box: Tk text widget that receives progress messages.
        select_button: Button that is re-enabled when processing ends.
        process_button: Button that is re-enabled when processing ends.

    Raises:
        Exception: Any error from reading, tokenizing or writing is reported
            in ``text_box`` and re-raised. The buttons are re-enabled either
            way.
    """
    # NOTE(review): start_processing runs this function on a worker thread
    # while it updates Tk widgets directly; confirm that is safe for the
    # Tcl/Tk build in use.
    output_dir = "processed"

    def log(message):
        # Append a message and keep the newest line visible.
        text_box.insert(tk.END, message)
        text_box.see(tk.END)

    try:
        # Create the output sub-folder.
        os.makedirs(output_dir, exist_ok=True)

        # Iterate over a snapshot so the in-place reset below cannot
        # interfere with the loop.
        for path in list(dflist):
            # Read the parquet file.
            df = pd.read_parquet(path, engine="pyarrow")

            # Build the 'tokens' column.
            total = len(df)
            tokens = []
            for idx, text in enumerate(df['general']):
                tokens.append(None if text is None else rcs(text))

                # Report progress for every row, including null rows, so
                # the final "total/total" line is never skipped.
                done = idx + 1
                if done % 100 == 0 or done == total:
                    log(f"Processing file {path}: {done}/{total} ({done / total * 100:.2f}%)\n")

            df['tokens'] = tokens

            # Destination path for the processed file.
            output_path = os.path.join(output_dir, os.path.basename(path))

            # Save as parquet.
            df.to_parquet(output_path, engine="pyarrow")
            log(f"Finished processing {path}\n")

        log("๋ชจ๋“  ํŒŒ์ผ์ด ์„ฑ๊ณต์ ์œผ๋กœ ์ฒ˜๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\n")

        # Open the output folder once everything is done. os.startfile only
        # exists on Windows, so skip it elsewhere instead of crashing.
        if hasattr(os, "startfile"):
            os.startfile(output_dir)

        # Reset the caller's list in place; rebinding the local name would
        # leave the caller's list untouched.
        if hasattr(dflist, "clear"):
            dflist.clear()
    except Exception as exc:
        # Surface the failure in the UI, then let it propagate.
        log(f"Error: {exc}\n")
        raise
    finally:
        # Re-enable the buttons even when processing failed.
        select_button.config(state=tk.NORMAL)
        process_button.config(state=tk.NORMAL)

def select_files():
    """Ask the user for parquet files and queue the chosen paths.

    The selected paths are appended to the module-level ``dflist`` and echoed
    into ``text_box``. Nothing happens when the dialog returns no paths.
    """
    chosen = filedialog.askopenfilenames(filetypes=[("Parquet files", "*.parquet")])
    if not chosen:
        return

    dflist.extend(chosen)
    text_box.insert(tk.END, f"Selected files:\n{chosen}\n")
    text_box.see(tk.END)

def start_processing():
    """Disable both buttons and run ``process_files`` on a new thread.

    Does nothing when no files have been queued in ``dflist``.
    """
    if dflist:
        for button in (select_button, process_button):
            button.config(state=tk.DISABLED)

        # Start the file-processing thread.
        worker = threading.Thread(
            target=process_files,
            args=(dflist, text_box, select_button, process_button),
        )
        worker.start()

# Tkinter UI setup

# Queue of selected parquet paths; the button callbacks read it as a module
# global, so it only has to exist before the event loop starts.
dflist = []

root = tk.Tk()
root.title("ํ”„๋กฌํ”„ํŠธ ์Šคํƒœ์ปค์šฉ parquet ํŒŒ์ผ ํ† ํฐ ์—…๋ฐ์ดํŠธ ๋„๊ตฌ")

# Row that holds the two action buttons.
frame = tk.Frame(master=root)
frame.pack(padx=10, pady=10)

select_button = tk.Button(master=frame, text="Parquet ํŒŒ์ผ ์„ ํƒ", command=select_files)
process_button = tk.Button(master=frame, text="ํ† ํฐ ๊ณ„์‚ฐ ์‹œ์ž‘", command=start_processing)

# Pack in the same left-to-right order: select first, then process.
select_button.pack(side=tk.LEFT, padx=5, pady=5)
process_button.pack(side=tk.LEFT, padx=5, pady=5)

# Scrolling log area for progress messages.
text_box = scrolledtext.ScrolledText(master=root, width=80, height=20)
text_box.pack(padx=10, pady=10)

root.mainloop()