import pandas as pd

_FILE_DIGITS = "123456789"
_KANJI_RANKS = "一二三四五六七八九"

# Square names as an ASCII file digit followed by a kanji rank ('1一' .. '9九').
KIFU_TO_SQUARE_NAMES = [
    file + rank for file in _FILE_DIGITS for rank in _KANJI_RANKS
]

# The same 81 squares, in the same order, written as two ASCII digits
# ('11' .. '99').
KIFU_FROM_SQUARE_NAMES = [
    file + rank for file in _FILE_DIGITS for rank in _FILE_DIGITS
]

# Two-digit square name -> digit + kanji square name.
_SQUARE_FROM_TO = dict(zip(KIFU_FROM_SQUARE_NAMES, KIFU_TO_SQUARE_NAMES))

# Substrings removed from player names, applied in this order so that the
# longer "...世名人" forms are removed before the bare "名人".
_NAME_NOISE = (
    "十七世名人", "十八世名人", "十九世名人",
    "王将", "王座", "名人", "竜王", "棋聖", "叡王", "王位", "棋王",
    "・", "二冠", "三冠",
)

# A commentary sentence containing any of these substrings is dropped.
_COMMENT_STOPWORDS = (
    "期", "出身", "優勝", "受賞", "回", "記録", "棋士番号", "勝", "敗", "名人",
    "時", "分", "成績", "棋戦", "段", "本日", "立会", "ABEMA", "第", "本局",
    "対局", "永世",
)

# Tokens that end a game record instead of describing a move.
_TERMINAL_MOVES = ("投了", "千日手", "持将棋", "反則勝ち")

# Piece-name rewrites applied to every converted move, in this order.
_PIECE_ALIASES = (("竜", "龍"), ("成銀", "全"), ("成桂", "圭"), ("成香", "杏"))

# Elements that mark the start of a move when inserting turn symbols.
_MOVE_START_MARKERS = (
    "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "同",
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
)


def _clean_player_name(name):
    """Return *name* without spaces, a trailing rank, and title words."""
    name = name.replace(" ", "").replace("\u3000", "")
    if name.endswith("段"):
        # Drop the last two characters, e.g. a "九段" suffix.
        name = name[:-2]
    for noise in _NAME_NOISE:
        name = name.replace(noise, "")
    return name


def nomalize_precedence_name(df: pd.DataFrame) -> pd.DataFrame:
    """Remove rank and title names from the first player's name.

    Rewrites the ``precedence_name`` column of *df* in place: ASCII and
    ideographic spaces are removed, a name ending in "段" loses its last two
    characters, and the title substrings in ``_NAME_NOISE`` are deleted.

    The column is assigned as a whole instead of through
    ``df[col].iloc[i] = ...``; that chained form does not update *df* when
    pandas Copy-on-Write is active.

    Returns:
        The same *df* object, for chaining.
    """
    df["precedence_name"] = [
        _clean_player_name(name) for name in df["precedence_name"]
    ]
    return df


def _clean_kif_entry(entry):
    """Strip leading ASCII digits, any '(' suffix, and U+3000 from a move."""
    # lstrip also copes with an empty or all-digit entry, which used to raise
    # IndexError when the first character was inspected.
    entry = entry.lstrip("0123456789")
    entry = entry.partition("(")[0]
    return entry.replace("\u3000", "")


def nomalize_kif(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize each game record by removing move numbers and time data.

    Every cell of the ``kif`` column is expected to hold the textual form of
    a list of strings. For each list entry the leading ASCII digits are
    stripped, everything from the first "(" onwards is cut, and ideographic
    spaces are removed. The cleaned entries are concatenated into one string,
    which replaces the cell.

    Returns:
        The same *df* object, with ``kif`` rewritten in place.
    """
    normalized = []
    for raw in df["kif"]:
        # NOTE(review): eval() executes arbitrary code; only safe while the
        # column comes from a trusted source. Consider ast.literal_eval.
        entries = eval(raw)
        normalized.append("".join(_clean_kif_entry(e) for e in entries))
    df["kif"] = normalized
    return df


def nomalize_comment(df: pd.DataFrame) -> pd.DataFrame:
    """Drop commentary sentences that contain any stop word.

    Each cell of the ``output`` column is split on "。". Empty fragments and
    fragments containing any substring from ``_COMMENT_STOPWORDS`` are
    discarded; the remaining ones are re-joined, each followed by "。".

    Returns:
        The same *df* object, with ``output`` rewritten in place.
    """
    cleaned = []
    for text in df["output"]:
        kept = [
            sentence
            for sentence in text.split("。")
            if sentence
            and not any(word in sentence for word in _COMMENT_STOPWORDS)
        ]
        cleaned.append("".join(sentence + "。" for sentence in kept))
    df["output"] = cleaned
    return df


def accuracy_bestlist(df):
    """Print per-row and mean agreement between played and suggested moves.

    For every row, ``bestlist``, ``best2list`` and ``kif`` are parsed from
    their textual form. Entry ``x`` of ``kif`` (for ``x >= 1``) counts as a
    hit when the first element of ``bestlist[x-1]`` or of ``best2list[x-1]``
    is contained in it. The row accuracy is ``hits / len(kif)``.

    Rows with an empty ``kif`` score 0.0 and an empty frame yields a mean of
    0.0; both cases previously raised ZeroDivisionError.

    Returns:
        The mean accuracy over all rows (also printed).
    """
    total = 0.0
    rows = 0
    columns = zip(df["bestlist"], df["best2list"], df["kif"])
    for z, (raw_best, raw_best2, raw_kif) in enumerate(columns):
        # NOTE(review): eval() executes arbitrary code; only safe while these
        # columns come from a trusted source.
        blist = eval(raw_best)
        b2list = eval(raw_best2)
        te = eval(raw_kif)
        hits = 0
        for x in range(1, len(te)):
            try:
                if blist[x - 1][0] in te[x] or b2list[x - 1][0] in te[x]:
                    hits += 1
            except (IndexError, KeyError, TypeError):
                # A missing or malformed suggestion counts as a miss.
                continue
        if hits == 0:
            print("accuracy = 0", z)
        accuracy = hits / len(te) if te else 0.0
        print("z = ", z, " accuracy = ", accuracy)
        total += accuracy
        rows += 1
    mean = total / rows if rows else 0.0
    print("mean_acuuracy", mean)
    return mean


def nomalize_sfen(s):
    """Extract the move list from the lines of a game record.

    The first two lines are skipped, as are lines of 30 or more characters
    and lines with fewer than two whitespace-separated tokens. For every
    other line the second token is taken as the move: a leading two-digit
    square such as "76" is converted to "7六", and the piece names in
    ``_PIECE_ALIASES`` are rewritten. A move without a recognised square
    prefix is kept as-is (apart from the piece rewrites); previously it
    raised NameError or silently reused the previous move's square.

    Reading stops after the first terminal token in ``_TERMINAL_MOVES``,
    which is appended unchanged.

    Args:
        s: Sequence of text lines.

    Returns:
        List of move strings in playing order.
    """
    movelist = []
    for index, line in enumerate(s):
        if index < 2 or len(line) >= 30:
            continue
        tokens = line.split()
        if len(tokens) < 2:
            continue
        move = tokens[1]
        if move in _TERMINAL_MOVES:
            movelist.append(move)
            break
        square = _SQUARE_FROM_TO.get(move[:2])
        word = move if square is None else square + move[2:]
        for old, new in _PIECE_ALIASES:
            word = word.replace(old, new)
        movelist.append(word)
    return movelist


def make_triplets(df, column):
    """Build (anchor, positive, negative) triplets of game records.

    For every row (the anchor) one random row with the same *column* value
    but a different ``kif`` is drawn as the positive, and one random row with
    a different *column* value and a different ``kif`` as the negative.
    ``DataFrame.sample(n=1)`` raises when no candidate row exists.

    Returns:
        List of tuples ``(anchor_kif, positive_kif, negative_kif,
        positive_column_value, negative_column_value)``. The original built
        this list but never returned it.
    """
    triplets = []
    for position in range(len(df)):
        anchor = df.iloc[position]
        different_kif = df["kif"] != anchor["kif"]
        same_label = df[column] == anchor[column]

        # Positive: same label as the anchor, but another record.
        positive = df.loc[same_label & different_kif].sample(n=1).iloc[0]
        # Negative: a record carrying a different label.
        negative = df.loc[~same_label & different_kif].sample(n=1).iloc[0]

        triplets.append((
            anchor["kif"],
            positive["kif"],
            negative["kif"],
            positive[column],
            negative[column],
        ))
    return triplets


def add_symbol(df, column):
    """Prefix every move in *column* with alternating "▲" / "△" symbols.

    Each cell is iterated element by element (character by character for a
    string). An element found in ``_MOVE_START_MARKERS`` starts a new move
    and receives the current turn symbol, after which the turn flips. All
    elements are concatenated into one string, which replaces the cell.

    The turn symbol restarts at "▲" for every row; previously it leaked from
    one row into the next. Elements are passed through ``str()`` so the
    integer markers no longer raise TypeError.

    Returns:
        The same *df* object, with *column* rewritten in place.
    """
    annotated = []
    for record in df[column]:
        teban = "▲"
        parts = []
        for element in record:
            if element in _MOVE_START_MARKERS:
                parts.append(teban + str(element))
                teban = "△" if teban == "▲" else "▲"
            else:
                parts.append(str(element))
        annotated.append("".join(parts))
    df[column] = annotated
    return df