adowu committed on
Commit
331c08f
1 Parent(s): 82d91f6

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +7 -7
database.py CHANGED
@@ -20,12 +20,12 @@ class KodeksProcessor:
20
 
21
  def extract_metadata(self, text: str) -> Dict:
22
  metadata = {}
23
- dz_u_match = re.search(r'Dz\.U\.(\\d{4})\.(\\d+)\.(\\d+)', text)
24
  if dz_u_match:
25
  metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
26
  metadata['rok'] = dz_u_match.group(1)
27
 
28
- nazwa_match = re.search(r'USTAWA\\s+z dnia(.*?)\\n(.*?)\\n', text)
29
  if nazwa_match:
30
  metadata['data_ustawy'] = nazwa_match.group(1).strip()
31
  metadata['nazwa'] = nazwa_match.group(2).strip()
@@ -39,10 +39,10 @@ class KodeksProcessor:
39
  return "", text
40
 
41
  def process_article(self, article_text: str) -> Dict:
42
- art_num_match = re.match(r'Art\\.\\s*(\\d+)', article_text)
43
  article_num = art_num_match.group(1) if art_num_match else ""
44
 
45
- paragraphs = re.findall(r'§\\s*(\\d+)[.\\s]+(.*?)(?=§\\s*\\d+|$)', article_text, re.DOTALL)
46
 
47
  if not paragraphs:
48
  return {
@@ -59,7 +59,7 @@ class KodeksProcessor:
59
 
60
  def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
61
  chunks = []
62
- chapters = re.split(r'(Rozdział \\d+\\n\\n[^\\n]+)\\n', text)
63
  current_chapter = ""
64
 
65
  for i, section in enumerate(chapters):
@@ -67,7 +67,7 @@ class KodeksProcessor:
67
  current_chapter = section.strip()
68
  continue
69
 
70
- articles = re.split(r'(Art\\.\\s*\\d+.*?)(?=Art\\.\\s*\\d+|$)', section)
71
 
72
  for article in articles:
73
  if not article.strip():
@@ -128,4 +128,4 @@ class KodeksProcessor:
128
  query_texts=[query],
129
  n_results=n_results
130
  )
131
- return results
 
20
 
21
  def extract_metadata(self, text: str) -> Dict:
22
  metadata = {}
23
+ dz_u_match = re.search(r'Dz\.U\.(\d{4})\.(\d+)\.(\d+)', text)
24
  if dz_u_match:
25
  metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
26
  metadata['rok'] = dz_u_match.group(1)
27
 
28
+ nazwa_match = re.search(r'USTAWA\s+z dnia(.*?)\n(.*?)\n', text)
29
  if nazwa_match:
30
  metadata['data_ustawy'] = nazwa_match.group(1).strip()
31
  metadata['nazwa'] = nazwa_match.group(2).strip()
 
39
  return "", text
40
 
41
  def process_article(self, article_text: str) -> Dict:
42
+ art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
43
  article_num = art_num_match.group(1) if art_num_match else ""
44
 
45
+ paragraphs = re.findall(r'§\s*(\d+)[.\s]+(.*?)(?=§\s*\d+|$)', article_text, re.DOTALL)
46
 
47
  if not paragraphs:
48
  return {
 
59
 
60
  def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
61
  chunks = []
62
+ chapters = re.split(r'(Rozdział \d+\n\n[^\n]+)\n', text)
63
  current_chapter = ""
64
 
65
  for i, section in enumerate(chapters):
 
67
  current_chapter = section.strip()
68
  continue
69
 
70
+ articles = re.split(r'(Art\.\s*\d+.*?)(?=Art\.\s*\d+|$)', section)
71
 
72
  for article in articles:
73
  if not article.strip():
 
128
  query_texts=[query],
129
  n_results=n_results
130
  )
131
+ return results