Spaces: Build error
Sam Passaglia committed
Commit f29cbf8
1 Parent(s): ac462f6
minor
Browse files
- config/heteronyms_Sato2022.json +0 -211
- yomikata/dataset/__init__.py +0 -0
- yomikata/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/aozora.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/bccwj.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/kwdlc.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/ndlbib.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/pronunciations.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/repair_long_vowels.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/split.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/sudachi.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/unidic.cpython-310.pyc +0 -0
- yomikata/dataset/aozora.py +0 -117
- yomikata/dataset/bccwj.py +0 -206
- yomikata/dataset/kwdlc.py +0 -109
- yomikata/dataset/ndlbib.py +0 -46
- yomikata/dataset/pronunciations.py +0 -57
- yomikata/dataset/repair_long_vowels.py +0 -62
- yomikata/dataset/split.py +0 -271
- yomikata/dataset/sudachi.py +0 -50
- yomikata/dataset/unidic.py +0 -44
- yomikata/main.py +0 -123
config/heteronyms_Sato2022.json
DELETED
@@ -1,211 +0,0 @@
{
    "heteronyms_in_bert": {
        "表": 2,
        "角": 4,
        "大分": 2,
        "国立": 2,
        "人気": 3,
        "市場": 2,
        "気質": 2,
        "役所": 2,
        "上方": 2,
        "上手": 3,
        "下手": 3,
        "人事": 2,
        "金星": 2,
        "仮名": 2,
        "内面": 2,
        "礼拝": 2,
        "遺言": 3,
        "口腔": 2,
        "後世": 2,
        "骨": 2,
        "一途": 2,
        "一言": 3,
        "最中": 3,
        "一目": 2,
        "係": 3,
        "足跡": 2,
        "今日": 2,
        "明日": 3,
        "生物": 3,
        "変化": 2,
        "大事": 2,
        "水車": 2,
        "一見": 2,
        "一端": 2,
        "大家": 3,
        "心中": 2,
        "書物": 2,
        "一角": 2,
        "一行": 3,
        "一時": 3,
        "一定": 2,
        "一方": 2,
        "一夜": 2,
        "下野": 3,
        "化学": 2,
        "火口": 2,
        "花弁": 2,
        "玩具": 2,
        "強力": 3,
        "金色": 2,
        "経緯": 2,
        "故郷": 2,
        "紅葉": 2,
        "行方": 3,
        "根本": 2,
        "左右": 3,
        "山陰": 2,
        "十分": 2,
        "上下": 5,
        "身体": 2,
        "水面": 2,
        "世論": 2,
        "清水": 3,
        "大手": 2,
        "大人": 4,
        "大勢": 3,
        "中間": 5,
        "日向": 42,
        "日時": 3,
        "夫婦": 2,
        "牧場": 2,
        "末期": 2,
        "利益": 2,
        "工夫": 2,
        "一味": 2,
        "魚": 3,
        "区分": 2,
        "施行": 4,
        "施工": 2,
        "転生": 2,
        "博士": 2,
        "法華": 2,
        "真面目": 3,
        "眼鏡": 2,
        "文字": 2,
        "文書": 3,
        "律令": 2,
        "現世": 2,
        "日中": 2,
        "夜中": 3,
        "前世": 2,
        "二人": 2,
        "立像": 2
    },
    "heteronyms_not_in_bert": {
        "教化": 3,
        "見物": 2,
        "清浄": 2,
        "谷間": 2,
        "追従": 2,
        "墓石": 2,
        "大文字": 2,
        "漢書": 2,
        "作法": 2,
        "兵法": 2,
        "大人気": 2,
        "半月": 2,
        "黒子": 2,
        "外面": 2,
        "競売": 2,
        "開眼": 2,
        "求道": 2,
        "血脈": 2,
        "施業": 2,
        "借家": 2,
        "頭蓋骨": 2,
        "法衣": 2,
        "昨日": 2,
        "氷柱": 2,
        "風車": 2,
        "寒気": 2,
        "背筋": 2,
        "逆手": 2,
        "色紙": 2,
        "生花": 3,
        "白髪": 2,
        "貼付": 2,
        "一回": 2,
        "一期": 2,
        "一月": 3,
        "一所": 2,
        "一寸": 2,
        "一声": 2,
        "一石": 2,
        "一日": 4,
        "一分": 3,
        "一文": 3,
        "一片": 3,
        "何時": 3,
        "何分": 2,
        "火煙": 2,
        "火傷": 2,
        "火床": 3,
        "火先": 2,
        "火筒": 2,
        "芥子": 3,
        "気骨": 2,
        "銀杏": 3,
        "元金": 2,
        "五分": 2,
        "後々": 2,
        "後生": 2,
        "御供": 4,
        "細々": 3,
        "細目": 2,
        "三位": 2,
        "疾風": 3,
        "菖蒲": 2,
        "世人": 2,
        "世路": 2,
        "船底": 2,
        "早急": 2,
        "相乗": 2,
        "造作": 2,
        "他言": 2,
        "東雲": 2,
        "頭数": 2,
        "二重": 2,
        "日供": 2,
        "日次": 4,
        "日暮": 3,
        "日来": 3,
        "梅雨": 2,
        "風穴": 2,
        "仏語": 3,
        "分別": 2,
        "面子": 2,
        "木目": 2,
        "目下": 2,
        "夜直": 2,
        "夜来": 2,
        "夜話": 2,
        "野兎": 2,
        "野馬": 3,
        "野分": 2,
        "野辺": 2,
        "野面": 3,
        "野立": 3,
        "冷水": 2,
        "連中": 2,
        "飛沫": 2,
        "翡翠": 2,
        "餃子": 2,
        "一足": 2,
        "意気地": 2,
        "一昨日": 3,
        "一昨年": 2,
        "十八番": 2,
        "十六夜": 2,
        "明後日": 2,
        "石綿": 2,
        "公文": 2,
        "読本": 3,
        "仏国": 3,
        "古本": 2,
        "町家": 2,
        "遊行": 2
    }
}
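
Orientation note: this deleted config maps each heteronym surface form to an integer, grouped (judging by the key names) by whether the surface is in the BERT tokenizer vocabulary. The rest of the code reaches heteronyms through config.HETERONYMS, so the direct load below is only an illustrative sketch, not how the repository consumes the file.

import json
from pathlib import Path

# Illustrative only: read the deleted config directly.
with open(Path("config", "heteronyms_Sato2022.json"), encoding="utf-8") as f:
    heteronyms = json.load(f)

heteronyms["heteronyms_in_bert"]["上下"]    # -> 5
"大人気" in heteronyms["heteronyms_not_in_bert"]  # -> True
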
yomikata/dataset/__init__.py
DELETED
File without changes
yomikata/dataset/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (163 Bytes)

yomikata/dataset/__pycache__/aozora.cpython-310.pyc
DELETED
Binary file (2.99 kB)

yomikata/dataset/__pycache__/bccwj.cpython-310.pyc
DELETED
Binary file (5.31 kB)

yomikata/dataset/__pycache__/kwdlc.cpython-310.pyc
DELETED
Binary file (2.47 kB)

yomikata/dataset/__pycache__/ndlbib.cpython-310.pyc
DELETED
Binary file (1.3 kB)

yomikata/dataset/__pycache__/pronunciations.cpython-310.pyc
DELETED
Binary file (1.44 kB)

yomikata/dataset/__pycache__/repair_long_vowels.cpython-310.pyc
DELETED
Binary file (2.13 kB)

yomikata/dataset/__pycache__/split.cpython-310.pyc
DELETED
Binary file (8.08 kB)

yomikata/dataset/__pycache__/sudachi.cpython-310.pyc
DELETED
Binary file (1.15 kB)

yomikata/dataset/__pycache__/unidic.cpython-310.pyc
DELETED
Binary file (1.27 kB)

yomikata/dataset/aozora.py
DELETED
@@ -1,117 +0,0 @@
"""aozora.py
Data processing script for aozora bunko file from https://github.com/ndl-lab/huriganacorpus-aozora
"""

import warnings
from pathlib import Path

import pandas as pd
from pandas.errors import ParserError
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils
from yomikata.dataset.repair_long_vowels import repair_long_vowels

warnings.filterwarnings("ignore")


def read_file(file: str):
    # logger.info("reading file")
    with open(file) as f:
        rows = [
            line.rstrip("\n").rstrip("\r").split("\t")[0:3] for line in f.readlines()
        ]
    df = pd.DataFrame(rows, columns=["word", "furigana", "type"])

    # logger.info("removing unused rows")
    # remove unused rows
    df = df[~df["type"].isin(["[入力 読み]", "分かち書き"])]
    df = df[~pd.isna(df["word"])]
    df = df[~pd.isnull(df["word"])]
    df = df[df["word"] != ""]

    # logger.info("organizing into sentences")
    # now organize remaining rows into sentences
    gyou_df = pd.DataFrame(columns=["sentence", "furigana", "sentenceid"])
    sentence = ""
    furigana = ""
    sentenceid = None
    gyous = []
    for row in df.itertuples():
        if row.type in ["[入力文]"]:
            sentence = row.word
        elif row.type in ["漢字"]:
            furigana += ttlig.RubyToken.from_furi(
                row.word, repair_long_vowels(row.furigana, row.word)
            ).to_code()
        elif row.word.split(":")[0] in ["行番号"]:
            if sentenceid:  # this handles the first row
                gyous.append([sentence, furigana, sentenceid])
            sentenceid = file.name + "_" + row.word.split(":")[1].strip()
            sentence = None
            furigana = ""
        else:
            furigana += row.word

    # last row handling
    gyous.append([sentence, furigana, sentenceid])

    # make dataframe
    gyou_df = pd.DataFrame(gyous, columns=["sentence", "furigana", "sentenceid"])
    gyou_df = gyou_df[~pd.isna(gyou_df.sentence)]

    # logger.info("cleaning rows")
    # clean rows
    gyou_df["furigana"] = gyou_df["furigana"].apply(utils.standardize_text)
    gyou_df["sentence"] = gyou_df["sentence"].apply(
        lambda s: utils.standardize_text(
            s.replace("|", "").replace(" ", "").replace("※", "")
        )
    )

    # logger.info("removing errors")
    # remove non-matching rows
    gyou_df = gyou_df[
        gyou_df["sentence"] == gyou_df["furigana"].apply(utils.remove_furigana)
    ]

    # remove known errors
    error_ids = []
    gyou_df = gyou_df[~gyou_df["sentenceid"].isin(error_ids)]

    # remove duplicates
    gyou_df = gyou_df.drop_duplicates()

    return gyou_df


def aozora_data():
    """Extract, load and transform the aozora data"""

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "aozora").glob("*/*/*.txt"))

    with open(Path(config.SENTENCE_DATA_DIR, "aozora.csv"), "w") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")

        df.to_csv(
            Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved all aozora data!")


if __name__ == "__main__":
    aozora_data()
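
Note on the output format: the furigana column built above stores speach ruby codes. A minimal sketch, assuming the {surface/reading} form that remove_other_readings in split.py later matches against:

from speach import ttlig

ttlig.RubyToken.from_furi("故郷", "ふるさと").to_code()
# -> "{故郷/ふるさと}"
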
yomikata/dataset/bccwj.py
DELETED
@@ -1,206 +0,0 @@
"""bccwj.py
Data processing script for files downloaded from Chuunagon search
Chuunagon URL: https://chunagon.ninjal.ac.jp/

Download with the settings
文脈中の区切り記号 |
文脈中の文区切り記号 #
前後文脈の語数 10
検索対象(固定長・可変長) 両方
共起条件の範囲 文境界をまたがない

ダウンロードオプション
システム Linux
文字コード UTF-8
改行コード LF
出力ファイルが一つの場合は Zip 圧縮を行わない 検索条件式ごとに出力ファイルを分割する
インラインタグを使用 CHECK BOTH 語彙素読み AND 発音形出現形語種 BOX
(発音形出現形 is the actual pronounced one, but displays e.g. よう れい as よー れー)
タグの区切り記号 :
"""

import warnings
from pathlib import Path

import jaconv
import pandas as pd
from speach.ttlig import RubyToken

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")

SENTENCE_SPLIT_CHAR = "#"
WORD_SPLIT_CHAR = "|"
READING_SEP_CHAR = ":"


def read_bccwj_file(filename: str):
    """ """

    df = pd.read_csv(filename, sep="\t")

    df["前文脈"] = df["前文脈"].fillna("")
    df["後文脈"] = df["後文脈"].fillna("")
    df["full_text"] = (
        df["前文脈"] + df["キー"] + "[" + df["語彙素読み"] + ":" + df["発音形出現形"] + "]" + df["後文脈"]
    )

    def get_sentences(row):
        sentences = row["full_text"].split(SENTENCE_SPLIT_CHAR)
        furigana_sentences = []
        for sentence in sentences:
            words_with_readings = sentence.split(WORD_SPLIT_CHAR)
            furigana_sentence = ""
            for word_with_reading in words_with_readings:
                word = word_with_reading.split("[")[0]
                form, reading = jaconv.kata2hira(
                    word_with_reading.split("[")[1].split("]")[0]
                ).split(READING_SEP_CHAR)

                if (
                    not utils.has_kanji(word)
                    or reading == jaconv.kata2hira(word)
                    or form == ""
                    or reading == ""
                ):
                    furigana_sentence += word
                else:
                    if ("ー" in reading) and ("ー" not in form):
                        indexes_of_dash = [
                            pos for pos, char in enumerate(reading) if char == "ー"
                        ]
                        for index_of_dash in indexes_of_dash:
                            if len(reading) == len(form):
                                dash_reading = form[index_of_dash]
                            else:
                                char_before_dash = reading[index_of_dash - 1]
                                if char_before_dash in "ねめせぜれてでけげへべぺ":
                                    digraphA = char_before_dash + "え"
                                    digraphB = char_before_dash + "い"
                                    if digraphA in form and digraphB not in form:
                                        dash_reading = "え"
                                    elif digraphB in form and digraphA not in form:
                                        dash_reading = "い"
                                    else:
                                        logger.warning(
                                            f"Leaving dash in {word} {form} {reading}"
                                        )
                                        dash_reading = "ー"
                                elif char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
                                    dash_reading = "う"
                                elif char_before_dash in "しじみいきぎひびち":
                                    dash_reading = "い"
                                elif char_before_dash in "そぞのこごもろとどよょおほぼぽ":
                                    digraphA = char_before_dash + "お"
                                    digraphB = char_before_dash + "う"
                                    if digraphA in form and digraphB not in form:
                                        dash_reading = "お"
                                    elif digraphB in form and digraphA not in form:
                                        dash_reading = "う"
                                    else:
                                        if digraphA in word and digraphB not in word:
                                            dash_reading = "お"
                                        elif digraphB in word and digraphA not in word:
                                            dash_reading = "う"
                                        else:
                                            logger.warning(
                                                f"Leaving dash in {word} {form} {reading}"
                                            )
                                            dash_reading = "ー"
                                else:
                                    logger.warning(
                                        f"Leaving dash in {word} {form} {reading}"
                                    )
                                    dash_reading = "ー"
                            reading = (
                                reading[:index_of_dash]
                                + dash_reading
                                + reading[index_of_dash + 1 :]
                            )
                    furigana_sentence += RubyToken.from_furi(word, reading).to_code()

            furigana_sentences.append(furigana_sentence)

        furigana_sentences = [
            utils.standardize_text(sentence) for sentence in furigana_sentences
        ]
        sentences = [utils.remove_furigana(sentence) for sentence in furigana_sentences]
        try:
            rowid = row["サンプル ID"]
        except KeyError:
            rowid = row["講演 ID"]
        if len(furigana_sentences) == 1:
            ids = [rowid]
        else:
            ids = [rowid + "_" + str(i) for i in range(len(furigana_sentences))]

        sub_df = pd.DataFrame(
            {"sentence": sentences, "furigana": furigana_sentences, "sentenceid": ids}
        )

        sub_df = sub_df[sub_df["sentence"] != sub_df["furigana"]]

        return sub_df

    output_df = pd.DataFrame()
    for i, row in df.iterrows():
        output_df = output_df.append(get_sentences(row))

    return output_df


def bccwj_data():
    """Extract, load and transform the bccwj data"""

    # Extract sentences from the data files
    bccwj_files = list(Path(config.RAW_DATA_DIR, "bccwj").glob("*.txt"))

    df = pd.DataFrame()

    for bccwj_file in bccwj_files:
        logger.info(bccwj_file.name)
        df = pd.concat([df, read_bccwj_file(bccwj_file)])

    # remove known errors
    error_ids = []

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df[df["sentence"] != ""]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)
    assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "bccwj.csv"), index=False)

    logger.info("✅ Saved bccwj data!")


def bccwj_subset(bccwj_file):
    """Extract, load and transform a subset of the bccwj data"""

    df = read_bccwj_file(bccwj_file)

    # remove known errors
    error_ids = []

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)

    # Output
    df.to_csv(
        Path(config.SENTENCE_DATA_DIR, bccwj_file.name.split(".")[0] + ".csv"),
        index=False,
    )

    logger.info("✅ Saved bccwj " + bccwj_file.name.split(".")[0] + " data!")


if __name__ == "__main__":
    bccwj_data()
yomikata/dataset/kwdlc.py
DELETED
@@ -1,109 +0,0 @@
"""kwdlc.py
Data processing script for KWDLC files directly in the repository format
KWDLC repository: https://github.com/ku-nlp/KWDLC
"""

import warnings
from pathlib import Path

import pandas as pd
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")


def read_knp_file(filename: str):
    with open(filename) as f:
        contents = f.readlines()

    ids = []
    sentences = []
    furiganas = []
    sentence = ""
    furigana = ""
    for row in contents:
        first_word = row.split(" ")[0]
        if first_word in ["*", "+"]:
            pass
        elif first_word == "#":
            sentence_id = row.split(" ")[1].split("S-ID:")[1]
        elif first_word == "EOS\n":
            sentence = utils.standardize_text(sentence)
            furigana = utils.standardize_text(furigana)
            if sentence == utils.remove_furigana(furigana):
                sentences.append(sentence)
                furiganas.append(furigana)
                ids.append(sentence_id)
            else:
                logger.warning(
                    f"Dropping mismatched line \n Sentence: {sentence} \n Furigana: {furigana}"
                )
            sentence = ""
            furigana = ""
        else:
            words = row.split(" ")
            sentence += words[0]
            if words[0] == words[1]:
                furigana += words[0]
            else:
                furigana += ttlig.RubyToken.from_furi(words[0], words[1]).to_code()

    assert len(ids) == len(sentences)
    assert len(sentences) == len(furiganas)
    return ids, sentences, furiganas  # readings


def kwdlc_data():
    """Extract, load and transform the kwdlc data"""

    # Extract sentences from the data files
    knp_files = list(Path(config.RAW_DATA_DIR, "kwdlc").glob("**/*.knp"))

    all_ids = []
    all_sentences = []
    all_furiganas = []
    for knp_file in knp_files:
        ids, sentences, furiganas = read_knp_file(knp_file)
        all_ids += ids
        all_sentences += sentences
        all_furiganas += furiganas

    # construct dataframe
    df = pd.DataFrame(
        list(
            zip(all_sentences, all_furiganas, all_ids)
        ),  # all_readings, all_furiganas)),
        columns=["sentence", "furigana", "sentenceid"],
    )

    # remove known errors
    error_ids = [
        "w201106-0000547376-1",
        "w201106-0001768070-1-01",
        "w201106-0000785999-1",
        "w201106-0001500842-1",
        "w201106-0000704257-1",
        "w201106-0002300346-3",
        "w201106-0001779669-3",
        "w201106-0000259203-1",
    ]

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)
    # Test
    assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "kwdlc.csv"), index=False)

    logger.info("✅ Saved kwdlc data!")


if __name__ == "__main__":
    kwdlc_data()
yomikata/dataset/ndlbib.py
DELETED
@@ -1,46 +0,0 @@
"""ndlbib.py
Data processing script for ndlbib sentence file from https://github.com/ndl-lab/huriganacorpus-ndlbib
"""

import warnings
from pathlib import Path

from pandas.errors import ParserError

from config import config
from config.config import logger
from yomikata.dataset.aozora import read_file

# ndlbib and aozora use same file structure

warnings.filterwarnings("ignore")


def ndlbib_data():
    """Extract, load and transform the ndlbib data"""

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "shosi").glob("*.txt"))

    with open(Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"), "w") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")

        df.to_csv(
            Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"),
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved ndlbib data!")


if __name__ == "__main__":
    ndlbib_data()
yomikata/dataset/pronunciations.py
DELETED
@@ -1,57 +0,0 @@
from pathlib import Path

import jaconv
import pandas as pd
from tqdm import tqdm

from config import config
from config.config import logger
from yomikata import utils


def pronunciation_data():

    data_files = list(Path(config.READING_DATA_DIR).glob("*.csv"))

    df = pd.DataFrame()

    for file in data_files:
        if (file.name == "all.csv") or (file.name == "ambiguous.csv"):
            continue
        output_df = pd.read_csv(file)
        df = pd.concat([df, output_df])

    df["surface"] = df["surface"].astype(str).str.strip()
    df["kana"] = df["kana"].astype(str).str.strip()

    tqdm.pandas()

    df["kana"] = df["kana"].progress_apply(utils.standardize_text)
    df["surface"] = df["surface"].progress_apply(utils.standardize_text)
    df["kana"] = df.progress_apply(lambda row: jaconv.kata2hira(row["kana"]), axis=1)
    df = df[df["surface"] != df["kana"]]
    df = df[df["kana"] != ""]

    df = df[df["surface"].progress_apply(utils.has_kanji)]

    df = df.loc[~df["surface"].str.contains(r"[〜〜()\)\(\*]\.")]

    df = df[["surface", "kana"]]
    df = df.drop_duplicates()

    df.to_csv(Path(config.READING_DATA_DIR, "all.csv"), index=False)

    logger.info("✅ Merged all the pronunciation data!")

    # merged_df = (
    #     df.groupby("surface")["kana"]
    #     .apply(list)
    #     .reset_index(name="pronunciations")
    # )

    # ambiguous_df = merged_df[merged_df["pronunciations"].apply(len) > 1]
    # ambiguous_df.to_csv(Path(config.READING_DATA_DIR, "ambiguous.csv"), index=False)


if __name__ == "__main__":
    pronunciation_data()
yomikata/dataset/repair_long_vowels.py
DELETED
@@ -1,62 +0,0 @@
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

pronunciation_df = pd.read_csv(Path(config.READING_DATA_DIR, "all.csv"))
pronunciation_df = pronunciation_df.groupby("surface")["kana"].apply(list)


def repair_long_vowels(kana: str, kanji: str = None) -> str:
    """Clean and normalize text

    Args:
        kana (str): input string
        kanji (str): input string, optional

    Returns:
        str: a cleaned string
    """

    reading = kana
    indices_of_dash = [pos for pos, char in enumerate(reading) if char == "ー"]

    # get rid of non-ambiguous dashes
    for index_of_dash in indices_of_dash:
        char_before_dash = reading[index_of_dash - 1]
        if char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
            reading = reading[:index_of_dash] + "う" + reading[index_of_dash + 1 :]
        elif char_before_dash in "しじみいきぎひびちぢぃ":
            reading = reading[:index_of_dash] + "い" + reading[index_of_dash + 1 :]

    indices_of_not_dash = [pos for pos, char in enumerate(reading) if char != "ー"]
    if len(indices_of_not_dash) != len(reading):
        if not kanji:
            logger.info("Disambiguating this dash requires kanji")
            logger.info(f"Left dash in {reading}")
        else:
            try:
                candidate_pronunciations = list(pronunciation_df[kanji])
            except KeyError:
                candidate_pronunciations = []

            candidate_pronunciations = list(set(candidate_pronunciations))

            candidate_pronunciations = [
                x for x in candidate_pronunciations if len(x) == len(reading)
            ]
            candidate_pronunciations = [
                x
                for x in candidate_pronunciations
                if all([x[i] == reading[i] for i in indices_of_not_dash])
            ]

            if len(candidate_pronunciations) == 1:
                reading = candidate_pronunciations[0]
            else:
                pass
                # logger.warning(f"Left dashes in {kanji} {reading}")

    return reading
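
Usage sketch for this deleted helper. The module reads READING_DATA_DIR/all.csv at import time, so pronunciations.py must have been run first; the example strings are illustrative.

from yomikata.dataset.repair_long_vowels import repair_long_vowels

repair_long_vowels("すーじ", "数字")  # -> "すうじ": す + ー is unambiguous, resolved to う
repair_long_vowels("きょー")          # -> "きょー": ambiguous without kanji, dash left in and logged
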
yomikata/dataset/split.py
DELETED
@@ -1,271 +0,0 @@
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from speach.ttlig import RubyFrag, RubyToken

from config import config
from config.config import logger
from yomikata import utils
from yomikata.dictionary import Dictionary


def train_val_test_split(X, y, train_size, val_size, test_size):
    """Split dataset into data splits."""
    assert (train_size + val_size + test_size) == 1
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_, y_, train_size=val_size / (test_size + val_size)
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


def filter_simple(input_file, output_file, heteronyms) -> None:
    """This filters out sentences which don't contain any heteronyms"""

    df = pd.read_csv(input_file)  # load
    logger.info(f"Prefilter size: {len(df)}")

    df = df[df["sentence"].str.contains(r"|".join(heteronyms))]
    logger.info(f"Postfilter size: {len(df)}")

    df.to_csv(output_file, index=False)


def filter_dictionary(input_file, output_file, heteronyms, dictionary) -> None:
    """This filters out sentences which contain heteronyms only as part of a compound which is known to the dictionary"""
    df = pd.read_csv(input_file)  # load
    logger.info(f"Prefilter size: {len(df)}")

    df["contains_heteronym"] = df["sentence"].apply(
        lambda s: not set(
            [dictionary.token_to_surface(m) for m in dictionary.tagger(s)]
        ).isdisjoint(heteronyms)
    )

    df = df[df["contains_heteronym"]]
    logger.info(f"Postfilter size: {len(df)}")

    df.to_csv(output_file, index=False)


def regroup_furigana(s, heteronym, heteronym_dict, dictionary, verbose=False):
    rubytokens = utils.parse_furigana(s)
    output_tokens = []
    for token in rubytokens.groups:
        if isinstance(token, RubyFrag):
            # this is a token with furigana
            if heteronym in token.text and token.text != heteronym:
                # it includes the heteronym but is not exactly the heteronym
                # if len(dictionary.tagger(token.text)) > 1:
                # it is not in the dictionary, so we try to regroup it
                # note this dictionary check is not foolproof: sometimes words are in the dictionary and found here,
                # but in a parse of the whole sentence the word will be split in two.
                # commented this out since actually even if it is part of dictionary, it will go through the training and so we might as well try to regroup it to avoid it being an <OTHER>
                viable_regroupings = []
                for reading in heteronym_dict[heteronym]:
                    regrouped_tokens = regroup_furigana_tokens(
                        [token], heteronym, reading, verbose=verbose
                    )
                    if regrouped_tokens != [token]:
                        if verbose:
                            print("viable regrouping found")
                        viable_regroupings.append(regrouped_tokens)
                if len(viable_regroupings) == 1:
                    output_tokens += viable_regroupings[0]
                    continue
                else:
                    if verbose:
                        print("multiple viable readings found, cannot regroup")
                    pass
        output_tokens.append(token)

    output_string = RubyToken(groups=output_tokens).to_code()
    assert utils.furigana_to_kana(output_string) == utils.furigana_to_kana(s)
    assert utils.remove_furigana(output_string) == utils.remove_furigana(s)
    return output_string


def regroup_furigana_tokens(ruby_tokens, heteronym, reading, verbose=False):
    if not len(ruby_tokens) == 1:
        raise ValueError("regroup failed, no support yet for token merging")
    ruby_token = ruby_tokens[0]
    text = ruby_token.text
    furi = ruby_token.furi
    try:
        split_text = [
            text[0 : text.index(heteronym)],
            heteronym,
            text[text.index(heteronym) + len(heteronym) :],
        ]
        split_text = [text for text in split_text if text != ""]
    except ValueError:
        if verbose:
            print("regroup failed, heteronym not in token text")
        return ruby_tokens

    try:
        split_furi = [
            furi[0 : furi.index(reading)],
            reading,
            furi[furi.index(reading) + len(reading) :],
        ]
        split_furi = [furi for furi in split_furi if furi != ""]
    except ValueError:
        if verbose:
            print("regroup failed, reading not in token furi")
        return ruby_tokens

    if not len(split_text) == len(split_furi):
        if verbose:
            print(
                "regroup failed, failed to find heteronym and its reading in the same place in the inputs"
            )
        return ruby_tokens

    regrouped_tokens = [
        RubyFrag(text=split_text[i], furi=split_furi[i]) for i in range(len(split_text))
    ]

    if not "".join([token.furi for token in ruby_tokens]) == "".join(
        [token.furi for token in regrouped_tokens]
    ):
        if verbose:
            print(
                "regroup failed, reading of produced result does not agree with reading of input"
            )
        return ruby_tokens
    if not [token.furi for token in regrouped_tokens if token.text == heteronym] == [
        reading
    ]:
        if verbose:
            print("regroup failed, the heteronym did not get assigned the reading")
        return ruby_tokens
    return regrouped_tokens


def optimize_furigana(input_file, output_file, heteronym_dict, dictionary) -> None:
    df = pd.read_csv(input_file)  # load
    logger.info("Optimizing furigana using heteronym list and dictionary")
    for heteronym in heteronym_dict.keys():
        logger.info(f"Heteronym {heteronym} {heteronym_dict[heteronym]}")
        n_with_het = sum(df["sentence"].str.contains(heteronym))
        rows_to_rearrange = df["sentence"].str.contains(heteronym)
        optimized_rows = df.loc[rows_to_rearrange, "furigana"].apply(
            lambda s: regroup_furigana(s, heteronym, heteronym_dict, dictionary)
        )
        n_rearranged = sum(df.loc[rows_to_rearrange, "furigana"] != optimized_rows)
        logger.info(f"{n_rearranged}/{n_with_het} sentences were optimized")
        df.loc[rows_to_rearrange, "furigana"] = optimized_rows
    df.to_csv(output_file, index=False)


def remove_other_readings(input_file, output_file, heteronym_dict):
    df = pd.read_csv(input_file)  # load
    logger.info(f"Prefilter size: {len(df)}")
    df["keep_row"] = False
    for heteronym in heteronym_dict.keys():
        logger.info(heteronym)
        n_with_het = sum(df["sentence"].str.contains(heteronym))
        keep_for_het = df["furigana"].str.contains(
            r"|".join(
                [f"{{{heteronym}/{reading}}}" for reading in heteronym_dict[heteronym]]
            )
        )
        df["keep_row"] = df["keep_row"] | keep_for_het
        logger.info(
            f"Dropped {n_with_het-sum(keep_for_het)}/{n_with_het} sentences which have different readings"
        )  # TODO reword
    df = df.loc[df["keep_row"]]
    df = df.drop("keep_row", axis=1)
    df.to_csv(output_file, index=False)


def check_data(input_file) -> bool:

    df = pd.read_csv(input_file)  # load
    df["furigana-test"] = df["sentence"] == df["furigana"].apply(utils.remove_furigana)
    assert df["furigana-test"].all()
    df["sentence-standardize-test"] = df["sentence"] == df["sentence"].apply(
        utils.standardize_text
    )
    assert df["sentence-standardize-test"].all()

    return True


def split_data(data_file) -> None:

    df = pd.read_csv(data_file)  # load

    X = df["sentence"].values
    y = df["furigana"].values

    (X_train, X_val, X_test, y_train, y_val, y_test) = train_val_test_split(
        X=X,
        y=y,
        train_size=config.TRAIN_SIZE,
        val_size=config.VAL_SIZE,
        test_size=config.TEST_SIZE,
    )

    train_df = pd.DataFrame({"sentence": X_train, "furigana": y_train})
    val_df = pd.DataFrame({"sentence": X_val, "furigana": y_val})
    test_df = pd.DataFrame({"sentence": X_test, "furigana": y_test})

    train_df.to_csv(Path(config.TRAIN_DATA_DIR, "train_" + data_file.name), index=False)
    val_df.to_csv(Path(config.VAL_DATA_DIR, "val_" + data_file.name), index=False)
    test_df.to_csv(Path(config.TEST_DATA_DIR, "test_" + data_file.name), index=False)


if __name__ == "__main__":

    input_files = [
        Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
        Path(config.SENTENCE_DATA_DIR, "kwdlc.csv"),
        Path(config.SENTENCE_DATA_DIR, "bccwj.csv"),
        Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"),
    ]

    logger.info("Merging sentence data")
    utils.merge_csvs(input_files, Path(config.SENTENCE_DATA_DIR, "all.csv"), n_header=1)

    logger.info("Rough filtering for sentences with heteronyms")
    filter_simple(
        Path(config.SENTENCE_DATA_DIR, "all.csv"),
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms_simple.csv"),
        config.HETERONYMS.keys(),
    )

    logger.info("Sudachidict filtering for out heteronyms in known compounds")
    filter_dictionary(
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms_simple.csv"),
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms.csv"),
        config.HETERONYMS.keys(),
        Dictionary("sudachi"),
    )

    logger.info("Optimizing furigana")
    optimize_furigana(
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms.csv"),
        Path(config.SENTENCE_DATA_DIR, "optimized_heteronyms.csv"),
        config.HETERONYMS,
        Dictionary("sudachi"),
    )

    logger.info("Removing heteronyms with unexpected readings")
    remove_other_readings(
        Path(config.SENTENCE_DATA_DIR, "optimized_heteronyms.csv"),
        Path(config.SENTENCE_DATA_DIR, "optimized_strict_heteronyms.csv"),
        config.HETERONYMS,
    )

    logger.info("Running checks on data")
    test_result = check_data(
        Path(config.SENTENCE_DATA_DIR, "optimized_strict_heteronyms.csv")
    )

    logger.info("Performing train/test/split")
    split_data(Path(config.SENTENCE_DATA_DIR, "optimized_strict_heteronyms.csv"))

    logger.info("Data splits successfully generated!")
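
To make the regrouping step concrete, a minimal sketch of what regroup_furigana_tokens does to a single ruby fragment; the heteronym 大人 and the reading おとな are chosen for illustration.

from speach.ttlig import RubyFrag

token = RubyFrag(text="大人気", furi="おとなげ")
regroup_furigana_tokens([token], "大人", "おとな")
# -> [RubyFrag(text="大人", furi="おとな"), RubyFrag(text="気", furi="げ")]
# The single fragment is split so that the heteronym carries exactly its own reading.
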
yomikata/dataset/sudachi.py
DELETED
@@ -1,50 +0,0 @@
"""sudachi.py
Data processing script for sudachi dictionary
"""

import warnings
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

warnings.filterwarnings("ignore")


def sudachi_data():

    sudachi_file = list(Path(config.RAW_DATA_DIR, "sudachi").glob("*.csv"))

    df = pd.DataFrame()

    for file in sudachi_file:
        logger.info(file.name)
        # Load file
        df = pd.concat(
            [
                df,
                pd.read_csv(
                    file,
                    header=None,
                ),
            ]
        )

    df["surface"] = df[0].astype(str).str.strip()
    df["kana"] = df[11].astype(str).str.strip()
    df["type"] = df[5].astype(str).str.strip()
    df = df[df["kana"] != "*"]
    df = df[df["surface"] != df["kana"]]
    df = df[df["type"] != "補助記号"]

    df = df[["surface", "kana"]]

    df.to_csv(Path(config.READING_DATA_DIR, "sudachi.csv"), index=False)

    logger.info("✅ Processed sudachi data!")


if __name__ == "__main__":
    sudachi_data()
yomikata/dataset/unidic.py
DELETED
@@ -1,44 +0,0 @@
"""unidic.py
Data processing script for unidic dictionary
"""

import warnings
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

warnings.filterwarnings("ignore")


def unidic_data():
    """Extract, load and transform the unidic data"""

    # Extract sentences from the data files
    unidic_file = list(Path(config.RAW_DATA_DIR, "unidic").glob("*.csv"))[0]

    # Load file
    df = pd.read_csv(
        unidic_file,
        header=None,
        names="surface id1 id2 id3 pos1 pos2 pos3 pos4 cType "
        "cForm lForm lemma orth orthBase pron pronBase goshu iType iForm fType "
        "fForm iConType fConType type kana kanaBase form formBase aType aConType "
        "aModType lid lemma_id".split(" "),
    )

    df["surface"] = df["surface"].astype(str).str.strip()
    df["kana"] = df["kana"].astype(str).str.strip()
    df = df[df["kana"] != "*"]
    df = df[df["surface"] != df["kana"]]
    df = df[["surface", "kana"]]

    df.to_csv(Path(config.READING_DATA_DIR, "unidic.csv"), index=False)

    logger.info("✅ Processed unidic data!")


if __name__ == "__main__":
    unidic_data()
yomikata/main.py
DELETED
@@ -1,123 +0,0 @@
"""main.py
Main entry point for training
"""

import sys
import tempfile
import warnings
from argparse import Namespace
from pathlib import Path

import mlflow
from datasets import load_dataset

from config import config
from config.config import logger
from yomikata import utils
from yomikata.dbert import dBert


# MLFlow model registry
mlflow.set_tracking_uri("file://" + str(config.RUN_REGISTRY.absolute()))


warnings.filterwarnings("ignore")


def train_model(
    model_name: "dBert",
    dataset_name: str = "",
    experiment_name: str = "baselines",
    run_name: str = "dbert-default",
    training_args: dict = {},
) -> None:
    """Train a model given arguments.

    Args:
        dataset_name (str): name of the dataset to be trained on. Defaults to the full dataset.
        args_fp (str): location of args.
        experiment_name (str): name of experiment.
        run_name (str): name of specific run in experiment.
    """

    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name):

        run_id = mlflow.active_run().info.run_id
        logger.info(f"Run ID: {run_id}")

        experiment_id = mlflow.get_run(run_id=run_id).info.experiment_id
        artifacts_dir = Path(config.RUN_REGISTRY, experiment_id, run_id, "artifacts")

        # Initialize the model
        if model_name == "dBert":
            reader = dBert(reinitialize=True, artifacts_dir=artifacts_dir)
        else:
            raise ValueError("model_name must be dBert for now")

        # Load train val test data
        dataset = load_dataset(
            "csv",
            data_files={
                "train": str(
                    Path(config.TRAIN_DATA_DIR, "train_" + dataset_name + ".csv")
                ),
                "val": str(Path(config.VAL_DATA_DIR, "val_" + dataset_name + ".csv")),
                "test": str(
                    Path(config.TEST_DATA_DIR, "test_" + dataset_name + ".csv")
                ),
            },
        )

        # Train
        training_performance = reader.train(dataset, training_args=training_args)

        # general_performance = evaluate.evaluate(reader, max_evals=20)

        with tempfile.TemporaryDirectory() as dp:
            # reader.save(dp)
            # utils.save_dict(general_performance, Path(dp, "general_performance.json"))
            utils.save_dict(training_performance, Path(dp, "training_performance.json"))
            mlflow.log_artifacts(dp)


def get_artifacts_dir_from_run(run_id: str):
    """Load artifacts directory for a given run_id.

    Args:
        run_id (str): id of run to load artifacts from.

    Returns:
        Path: path to artifacts directory.

    """

    # Locate specifics artifacts directory
    experiment_id = mlflow.get_run(run_id=run_id).info.experiment_id
    artifacts_dir = Path(config.RUN_REGISTRY, experiment_id, run_id, "artifacts")

    return artifacts_dir


if __name__ == "__main__":

    # get args filepath from input
    args_fp = sys.argv[1]

    # load the args_file
    args = Namespace(**utils.load_dict(filepath=args_fp)).__dict__

    # pop meta variables
    model_name = args.pop("model")
    dataset_name = args.pop("dataset")
    experiment_name = args.pop("experiment")
    run_name = args.pop("run")

    # Perform training
    train_model(
        model_name=model_name,
        dataset_name=dataset_name,
        experiment_name=experiment_name,
        run_name=run_name,
        training_args=args,
    )
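
The __main__ block takes the path to an args file; utils.load_dict presumably parses JSON. The four meta keys below are the ones popped in the code, and everything else is forwarded untouched as training_args. A hedged sketch of building such a file; the two training-argument names are placeholders, not keys defined by this commit.

import json

args = {
    "model": "dBert",                          # popped: which reader class to build
    "dataset": "optimized_strict_heteronyms",  # popped: prefix of the train_/val_/test_ csv files
    "experiment": "baselines",                 # popped: MLflow experiment name
    "run": "dbert-default",                    # popped: MLflow run name
    # remaining keys go straight to reader.train() as training_args (placeholder names)
    "num_train_epochs": 10,
    "learning_rate": 2e-5,
}
with open("args.json", "w") as f:
    json.dump(args, f, indent=4)

# then: python yomikata/main.py args.json
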