"""Build the local ``sonajaht.db`` DuckDB database from the ``adorkin/sonajaht`` dataset.

Steps performed at import/run time:

1. Load the ``definitions`` and ``words`` configurations of the dataset.
2. Strip ``<...>`` tags from the ``value`` column of the definitions and
   collapse every run of whitespace to a single space.
3. Keep only definitions whose ``lang`` is ``'est'`` and whose cleaned
   ``value`` is longer than 5 characters, then add an ``entry_id`` column
   holding the row position in the filtered frame.
4. Write both frames to ``sonajaht.db`` as the tables ``definitions`` and
   ``words``.
"""

import re

import duckdb
from datasets import load_dataset

# Matches any "<...>" span; used to drop markup tags from definition text.
_TAG_PATTERN = re.compile(r"<[^>]*>")

definitions_ds = load_dataset("adorkin/sonajaht", "definitions")
words_ds = load_dataset("adorkin/sonajaht", "words")

definitions = definitions_ds["definitions"].to_pandas()

# Remove tags, then normalise whitespace. ``.str.split().str.join(" ")`` yields
# the same text as ``" ".join(s.split())`` for strings, but propagates missing
# values instead of raising AttributeError on them.
definitions["value"] = (
    definitions["value"]
    .str.replace(_TAG_PATTERN, "", regex=True)
    .str.split()
    .str.join(" ")
)

# DuckDB resolves ``definitions`` to the DataFrame above (replacement scan on
# the default in-memory connection). Rows with a NULL ``value`` are dropped
# here because ``LENGTH(NULL) > 5`` is not true.
definitions = duckdb.query(
    "SELECT * FROM definitions WHERE lang = 'est' AND LENGTH(value) > 5"
).df()

# Expose the position in the filtered frame as an explicit ``entry_id`` column.
definitions = definitions.reset_index(names="entry_id")

words = words_ds["words"].to_pandas()

# The context manager closes the database file even if a statement fails.
with duckdb.connect("sonajaht.db") as conn:
    # Register the frames under names that differ from the target tables.
    # Otherwise, once a ``definitions``/``words`` table exists in the file,
    # ``SELECT * FROM definitions`` would read that table, not the DataFrame.
    conn.register("definitions_df", definitions)
    conn.register("words_df", words)

    # OR REPLACE makes the script re-runnable: a second run rebuilds the
    # tables instead of failing with "table already exists".
    conn.execute(
        "CREATE OR REPLACE TABLE definitions AS SELECT * FROM definitions_df"
    )
    conn.execute("CREATE OR REPLACE TABLE words AS SELECT * FROM words_df")