File size: 741 Bytes
ade3b7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import re

import duckdb
from datasets import load_dataset

# Build a local DuckDB database file ("sonajaht.db") from the
# "adorkin/sonajaht" dataset: a cleaned, filtered `definitions` table and an
# unmodified `words` table.

# Matches anything shaped like a markup tag, e.g. "<b>" or "</i>".
_TAG_PATTERN = re.compile(r"<[^>]*>")

definitions_ds = load_dataset("adorkin/sonajaht", "definitions")
words_ds = load_dataset("adorkin/sonajaht", "words")

definitions = definitions_ds["definitions"].to_pandas()
# Strip tags, then collapse every run of whitespace into a single space.
# The vectorised split/join yields the same text as `" ".join(el.split())`
# for strings, but passes missing values through instead of raising
# AttributeError on them.
definitions["value"] = (
    definitions["value"]
    .str.replace(_TAG_PATTERN, "", regex=True)
    .str.split()
    .str.join(" ")
)

# Keep only rows whose `lang` is 'est' and whose cleaned text is longer than
# five characters. DuckDB resolves the name `definitions` in the query to the
# DataFrame above (replacement scan on Python variables).
definitions = duckdb.query(
    "SELECT * FROM definitions WHERE lang = 'est' AND LENGTH(value) > 5"
).df()

# Turn the index of the filtered frame into an explicit `entry_id` column.
definitions = definitions.reset_index(names="entry_id")

words = words_ds["words"].to_pandas()

# The context manager closes the database file even if a statement fails.
with duckdb.connect("sonajaht.db") as conn:
    # Register the frames under names distinct from the target tables, so the
    # SELECT always reads the DataFrame rather than a same-named table left in
    # the file by an earlier run.
    conn.register("definitions_df", definitions)
    conn.register("words_df", words)
    # OR REPLACE lets the script be re-run against an existing sonajaht.db
    # instead of failing because the tables already exist.
    conn.execute(
        "CREATE OR REPLACE TABLE definitions AS SELECT * FROM definitions_df"
    )
    conn.execute("CREATE OR REPLACE TABLE words AS SELECT * FROM words_df")