|
import os
|
|
import pandas as pd
|
|
from transformers import CLIPTokenizer
|
|
import tkinter as tk
|
|
from tkinter import filedialog, scrolledtext
|
|
import threading
|
|
|
|
|
|
# Shared CLIP tokenizer, loaded once at import time from the
# "openai/clip-vit-base-patch32" pretrained checkpoint; rcs() uses it to
# turn text into token ids.
s_token = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
|
|
|
|
def rcs(text):
    """Return how many token ids the shared tokenizer produces for ``text``.

    Args:
        text: The string to tokenize, or ``None``.

    Returns:
        The length of ``s_token.encode(text)``, or ``None`` when ``text``
        is ``None``.
    """
    return None if text is None else len(s_token.encode(text))
|
|
|
|
def _append_log(text_box, message):
    """Append ``message`` to ``text_box`` and scroll so it is visible."""
    text_box.insert(tk.END, message)
    text_box.see(tk.END)


def process_files(dflist, text_box, select_button, process_button):
    """Add a ``tokens`` column to each selected parquet file and save a copy.

    Every path in ``dflist`` is read with pandas, each value of its
    ``general`` column is passed to ``rcs`` to obtain a token count
    (``None`` values stay ``None``), and the result is written under the
    same file name inside the ``processed`` directory. Progress lines are
    appended to ``text_box`` every 100 rows and on the last row.

    Args:
        dflist: Mutable list of parquet file paths. It is emptied in place
            when the run ends so the next selection starts fresh.
        text_box: Text widget that receives progress messages.
        select_button: Button re-enabled when the run ends.
        process_button: Button re-enabled when the run ends.

    Raises:
        Exception: Any error raised while reading, tokenizing or writing is
            reported in ``text_box`` and then re-raised.
    """
    # NOTE(review): start_processing runs this function on a worker thread,
    # yet it talks to Tk widgets directly. Tkinter calls from non-main
    # threads are not guaranteed to be safe; consider marshalling the widget
    # updates through ``widget.after`` -- confirm before relying on it.
    output_dir = "processed"

    try:
        os.makedirs(output_dir, exist_ok=True)

        for path in dflist:
            df = pd.read_parquet(path, engine="pyarrow")

            tokens = []
            total = len(df)
            for row_number, text in enumerate(df['general'], start=1):
                tokens.append(rcs(text) if text is not None else None)

                # Report for every row, including None rows, so the final
                # "total/total" line is never skipped.
                if row_number % 100 == 0 or row_number == total:
                    _append_log(
                        text_box,
                        f"Processing file {path}: {row_number}/{total} "
                        f"({row_number / total * 100:.2f}%)\n",
                    )

            df['tokens'] = tokens

            output_path = os.path.join(output_dir, os.path.basename(path))
            df.to_parquet(output_path, engine="pyarrow")
            _append_log(text_box, f"Finished processing {path}\n")

        # "All files were processed successfully."
        _append_log(text_box, "모든 파일이 성공적으로 처리되었습니다.\n")

        # os.startfile exists only on Windows; elsewhere just show the path.
        if hasattr(os, "startfile"):
            os.startfile(output_dir)
        else:
            _append_log(
                text_box, f"Output directory: {os.path.abspath(output_dir)}\n"
            )
    except Exception as exc:
        # Surface the failure in the GUI as well as in the thread traceback.
        _append_log(text_box, f"Error: {exc}\n")
        raise
    finally:
        # Clear the caller's list in place; rebinding the local name would
        # leave the old paths queued for the next run.
        dflist.clear()

        # Always hand control back to the user, even after a failure.
        for button in (select_button, process_button):
            button.config(state=tk.NORMAL)
|
|
|
|
def select_files():
    """Ask the user for parquet files and queue the chosen paths.

    Opens a file dialog restricted to ``*.parquet``. If the dialog returns
    any paths they are appended to the module-level ``dflist`` and echoed
    into ``text_box``; otherwise nothing happens.
    """
    parquet_filter = [("Parquet files", "*.parquet")]
    chosen = filedialog.askopenfilenames(filetypes=parquet_filter)
    if not chosen:
        return

    dflist.extend(chosen)
    message = f"Selected files:\n{chosen}\n"
    text_box.insert(tk.END, message)
    text_box.see(tk.END)
|
|
|
|
def start_processing():
    """Start ``process_files`` on a background thread for the queued paths.

    Does nothing when the module-level ``dflist`` is empty. Otherwise both
    buttons are disabled and a thread is started that runs ``process_files``
    with ``dflist``, ``text_box`` and the two buttons.
    """
    if dflist:
        for button in (select_button, process_button):
            button.config(state=tk.DISABLED)

        worker = threading.Thread(
            target=process_files,
            args=(dflist, text_box, select_button, process_button),
        )
        worker.start()
|
|
|
|
|
|
# Build the main window.
root = tk.Tk()
# Title: "Token update tool for prompt-stacker parquet files".
# The literal was split across two lines and mis-encoded; restored here.
root.title("프롬프트 스태커용 parquet 파일 토큰 업데이트 도구")

# Row holding the two action buttons.
frame = tk.Frame(root)
frame.pack(padx=10, pady=10)

# Label: "Select Parquet files".
select_button = tk.Button(frame, text="Parquet 파일 선택", command=select_files)
select_button.pack(side=tk.LEFT, padx=5, pady=5)

# Label: "Start token calculation".
process_button = tk.Button(frame, text="토큰 계산 시작", command=start_processing)
process_button.pack(side=tk.LEFT, padx=5, pady=5)

# Scrollable log area written to by select_files and process_files.
text_box = scrolledtext.ScrolledText(root, width=80, height=20)
text_box.pack(padx=10, pady=10)

# Paths queued for processing: select_files appends to this list and
# start_processing passes it to the worker thread.
dflist = []

root.mainloop()
|
|
|