Spaces:
Runtime error
Runtime error
import os | |
import glob | |
import time | |
import numpy as np | |
from os import path | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
from sklearn.cluster import DBSCAN | |
from sklearn.preprocessing import StandardScaler | |
import arrange as DbscanArrange | |
import directories as Dir | |
""" | |
์ด์ ํ๋ก์ธ์ค: | |
image detection์ ํตํด handwritten text image์์ | |
๊ฐ๊ฐ์ word image๋ฅผ ์ถ์ถํ์ผ๋, | |
์์๋ฅผ ์์ง ๋ชปํ๊ฒ ๋จ์ด๋ง ํ์ธ์ด ๋์ด | |
์๋ ๊ธ์ ์๋ฏธ๋ฅผ ์์ด๋ฒ๋ฆฌ๋ ๋ฌธ์ ๊ฐ ์์์. | |
๋ค๋ง ์ถ์ถ ์์ ๋จ์ด์ ์ขํ๊ฐ์ ํฌํจ์ํฌ ์ ์์๊ธฐ์ | |
๊ฐ ๋จ์ด์ ์ด๋ฆ์ x,y์ขํ๊ฐ๊ณผ w,h ์ ๋ณด๋ฅผ ํฌํจํ์ฌ | |
๋จ์ด ์ด๋ฏธ์ง ํ์ผ์ ์์ฑ | |
ํ์ฌ ํ๋ก์ธ์ค: | |
yolo model์ ํตํด crop๋ ์ด๋ฏธ์ง๋ค์ | |
ํ์ผ ์ด๋ฆ์ crop๋๊ธฐ ์ raw image์์์ ์ขํ ์ ๋ณด๋ฅผ ํฌํจํฉ๋๋ค. | |
x89y147w199h184.jpg | |
x10y148w157h184.jpg | |
x28y149w108h180.jpg | |
raw imageํ์ผ์์ ๊ฐ์ line์ ์๋ word๋ค์ | |
๊ฐ๊น์ด y๊ฐ์ ๊ฐ๊ณ ์์ ๊ฒ์ด๋ฏ๋ก, | |
๋ฐ๋๊ธฐ๋ฐ ๊ตฐ์งํ ์๊ณ ๋ฆฌ์ฆ์ ์ฌ์ฉํ์ฌ | |
๊ฐ์ line์ ์์ ๊ฒ์ด๋ผ๊ณ ์์ํ๋ word๊ฐ๋ค์ | |
๋์ผ ํด๋ฌ์คํฐ์ ์ํ๊ฒ ๋จ. | |
์ด๋ ๊ฒ ๋์ผ ํด๋ฌ์คํฐ์ ์ํ๋ y๊ฐ๋ค์ ํ๊ท ์ ๊ตฌํ์ฌ | |
word์ ์๋ก์ด y๊ฐ์ผ๋ก label. | |
๋จ์ด ์ด๋ฏธ์ง ํ์ผ renameํ ๋, y๊ฐ์ด ๋จผ์ ์ค๊ฒ ํ๋ค. | |
yyyy_xxxx.jpg | |
y๊ฐ์ ๋ํ ์ ๋ ฌ์ด ๋๊ณ | |
x๊ฐ์ ๋ํด ์ค๋ฆ์ฐจ์์ผ๋ก ์ ๋ ฌ๋จ. | |
0148_0010.jpg | |
0148_0028.jpg | |
0148_0089.jpg | |
word file์ ์์๊ฐ ์๋์ ์ผ๋ก ์๋ text์ ์๋ฏธ๋ฅผ ๊ฐ์ง๊ฒ ๋๋ค. | |
์ฝ๋ ์ค๋ช : | |
extract_text_from_filename(), | |
get_folder_contents_with_text() ํจ์๋ฅผ ์ด์ฉํด | |
ํ์ผ ์ด๋ฆ์์ x๊ฐ๊ณผ y๊ฐ์ ์ถ์ถ. | |
StandardScaler๋ก ํ์คํ. | |
Get Clustered y values using DBSCAN. | |
rename_file()์ ์ด์ฉํด์ rename image files with y-clustered values. | |
""" | |
# ์ฌ์ฉ ์์ | |
# Cluster cropped word images into text lines by their y coordinate and rename
# every file to "yyyy_xxxx.jpg" so that a plain name sort gives reading order.

# DBSCAN settings: neighbourhood radius on the standardized y axis and the
# minimum number of words needed to form a line.
DBSCAN_EPS = 0.04
DBSCAN_MIN_SAMPLES = 2

# Columns of the per-word result table.
RESULT_COLUMNS = ['x_value', 'file_name', 'y_mean']


def _collect_word_rows(filenames):
    """Return one result row (dict) per file name.

    The x and y values are parsed from the file name with
    ``DbscanArrange.extract_text_from_filename`` and converted to ``int``.
    ``y_mean`` starts out as the word's own y value; the caller replaces it
    with the cluster mean for words that belong to a detected line.
    """
    rows = []
    for filename in filenames:
        x_data, y_data = DbscanArrange.extract_text_from_filename(filename)
        rows.append({'x_value': int(x_data),
                     'file_name': filename,
                     'y_mean': int(y_data)})
    return rows


# Get cropped word images.
# Example: Dir.folder_path = "C:/Users/ban/TEXTAI/yolov5/runs/detect/yujin_paper/crops/word"
# NOTE(review): presumably returns three parallel sequences (x texts, y texts,
# file names) -- confirm against arrange.get_folder_contents_with_text.
x_texts, y_texts, name_jpg = DbscanArrange.get_folder_contents_with_text(Dir.folder_path)

unknown_words = []  # new names of the words DBSCAN could not assign to a line
noise_files = set()  # original names of those words
whole_word_map_df = pd.DataFrame(columns=RESULT_COLUMNS)

# StandardScaler rejects an empty sample set, so skip everything when the
# folder yielded no files.
if len(name_jpg) > 0:
    # Standardize the y values. The explicit float conversion makes the
    # scaler input numeric even if the values arrive as strings.
    y_text = np.asarray(y_texts, dtype=float).reshape(-1, 1)
    scalerX = StandardScaler()
    std_y_text = scalerX.fit_transform(y_text)
    feature = pd.DataFrame(std_y_text, columns=['feature'])

    # Create the model and predict one cluster label per word.
    model = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES)
    predict = pd.DataFrame(model.fit_predict(feature), columns=['predict'])

    # Merge file_name, feature and predict side by side.
    file_name = pd.DataFrame({'file_name': list(name_jpg)})
    r = pd.concat([file_name, feature, predict], axis=1)
    r = r.sort_values(by=['predict'])

    line_frames = []
    for line in sorted(set(r['predict'])):  # labels: -1 (noise), 0, 1, 2, ...
        same_line = r[r['predict'] == line]
        # Build a fresh table for every label so rows of a previously
        # processed cluster can never leak into this one.
        total_word_map_df = pd.DataFrame(
            _collect_word_rows(same_line['file_name']),
            columns=RESULT_COLUMNS,
        )
        if line >= 0:
            # A real line: order its words left to right and give all of
            # them the (truncated) mean y of the line.
            total_word_map_df = total_word_map_df.sort_values(by=['x_value'])
            total_word_map_df['y_mean'] = int(np.mean(total_word_map_df['y_mean']))
        else:
            # Noise: every word keeps its own y value.
            noise_files.update(total_word_map_df['file_name'])
        line_frames.append(total_word_map_df)
    whole_word_map_df = pd.concat(line_frames, ignore_index=True)

file_name = list(whole_word_map_df['file_name'])
x_value = list(whole_word_map_df['x_value'])
y_mean = list(whole_word_map_df['y_mean'])

# Rename every file to zero-padded "yyyy_xxxx.jpg".
for old_name, x, y in zip(file_name, x_value, y_mean):
    new_name = str(y).zfill(4) + "_" + str(x).zfill(4) + ".jpg"
    old_path = str(Dir.folder_path) + "/" + str(old_name)
    new_path = str(Dir.folder_path) + "/" + new_name
    DbscanArrange.rename_file(old_path, new_path)
    if old_name in noise_files:
        unknown_words.append(new_name)

# Report the words that were not assigned to any line. The original check
# compared a list with True, which is never equal, so nothing was printed.
if unknown_words:
    print('UNKNOWN WORDS: \n', unknown_words)