Update database.py
Browse files- database.py +7 -7
database.py
CHANGED
@@ -20,12 +20,12 @@ class KodeksProcessor:
|
|
20 |
|
21 |
def extract_metadata(self, text: str) -> Dict:
|
22 |
metadata = {}
|
23 |
-
dz_u_match = re.search(r'Dz\.U\.(
|
24 |
if dz_u_match:
|
25 |
metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
|
26 |
metadata['rok'] = dz_u_match.group(1)
|
27 |
|
28 |
-
nazwa_match = re.search(r'USTAWA
|
29 |
if nazwa_match:
|
30 |
metadata['data_ustawy'] = nazwa_match.group(1).strip()
|
31 |
metadata['nazwa'] = nazwa_match.group(2).strip()
|
@@ -39,10 +39,10 @@ class KodeksProcessor:
|
|
39 |
return "", text
|
40 |
|
41 |
def process_article(self, article_text: str) -> Dict:
|
42 |
-
art_num_match = re.match(r'Art
|
43 |
article_num = art_num_match.group(1) if art_num_match else ""
|
44 |
|
45 |
-
paragraphs = re.findall(r'
|
46 |
|
47 |
if not paragraphs:
|
48 |
return {
|
@@ -59,7 +59,7 @@ class KodeksProcessor:
|
|
59 |
|
60 |
def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
|
61 |
chunks = []
|
62 |
-
chapters = re.split(r'(Rozdział
|
63 |
current_chapter = ""
|
64 |
|
65 |
for i, section in enumerate(chapters):
|
@@ -67,7 +67,7 @@ class KodeksProcessor:
|
|
67 |
current_chapter = section.strip()
|
68 |
continue
|
69 |
|
70 |
-
articles = re.split(r'(Art
|
71 |
|
72 |
for article in articles:
|
73 |
if not article.strip():
|
@@ -128,4 +128,4 @@ class KodeksProcessor:
|
|
128 |
query_texts=[query],
|
129 |
n_results=n_results
|
130 |
)
|
131 |
-
return results
|
|
|
20 |
|
21 |
def extract_metadata(self, text: str) -> Dict:
|
22 |
metadata = {}
|
23 |
+
dz_u_match = re.search(r'Dz\.U\.(\d{4})\.(\d+)\.(\d+)', text)
|
24 |
if dz_u_match:
|
25 |
metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
|
26 |
metadata['rok'] = dz_u_match.group(1)
|
27 |
|
28 |
+
nazwa_match = re.search(r'USTAWA\s+z dnia(.*?)\n(.*?)\n', text)
|
29 |
if nazwa_match:
|
30 |
metadata['data_ustawy'] = nazwa_match.group(1).strip()
|
31 |
metadata['nazwa'] = nazwa_match.group(2).strip()
|
|
|
39 |
return "", text
|
40 |
|
41 |
def process_article(self, article_text: str) -> Dict:
|
42 |
+
art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
|
43 |
article_num = art_num_match.group(1) if art_num_match else ""
|
44 |
|
45 |
+
paragraphs = re.findall(r'§\s*(\d+)[.\s]+(.*?)(?=§\s*\d+|$)', article_text, re.DOTALL)
|
46 |
|
47 |
if not paragraphs:
|
48 |
return {
|
|
|
59 |
|
60 |
def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
|
61 |
chunks = []
|
62 |
+
chapters = re.split(r'(Rozdział \d+\n\n[^\\n]+)\n', text)
|
63 |
current_chapter = ""
|
64 |
|
65 |
for i, section in enumerate(chapters):
|
|
|
67 |
current_chapter = section.strip()
|
68 |
continue
|
69 |
|
70 |
+
articles = re.split(r'(Art\.\s*\d+.*?)(?=Art\.\s*\d+|$)', section)
|
71 |
|
72 |
for article in articles:
|
73 |
if not article.strip():
|
|
|
128 |
query_texts=[query],
|
129 |
n_results=n_results
|
130 |
)
|
131 |
+
return results
|