neuralworm committed on
Commit
6c0aa26
1 Parent(s): 26e9493

populate db, db optimize

Browse files
Files changed (2) hide show
  1. app.py +185 -157
  2. gematria.db +2 -2
app.py CHANGED
@@ -3,182 +3,210 @@ import json
3
  import re
4
  import sqlite3
5
  import logging
 
6
  from util import process_json_files
7
  from gematria import calculate_gematria
8
  from deep_translator import GoogleTranslator, exceptions
9
 
10
- logging.basicConfig(level=logging.INFO, format='%(message)s')
 
 
 
 
 
11
 
12
  def flatten_text(text):
13
- """Helper function to flatten nested lists into a single list."""
14
- if isinstance(text, list):
15
- return " ".join(flatten_text(item) if isinstance(item, list) else item for item in text)
16
- return text
17
 
18
  def initialize_database():
19
- conn = sqlite3.connect('gematria.db')
20
- c = conn.cursor()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  c.execute('''
22
- CREATE TABLE IF NOT EXISTS results (
23
- gematria_sum INTEGER,
24
- words TEXT,
25
- translation TEXT,
26
- book INTEGER,
27
- title TEXT,
28
- chapter INTEGER,
29
- verse INTEGER,
30
- UNIQUE(gematria_sum, words, book, title, chapter, verse)
31
- )
32
- ''')
 
 
 
 
 
 
33
  conn.commit()
34
- conn.close()
35
 
36
- def insert_phrase_to_db(c, gematria_sum, phrase_candidate, translation, book_id, title, chapter_id, verse_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  try:
38
- logging.info(f"Inserting: {gematria_sum}, {phrase_candidate}, {translation}, {book_id}, {title}, {chapter_id}, {verse_id}")
39
- c.execute('''
40
- INSERT INTO results (gematria_sum, words, translation, book, title, chapter, verse)
41
- VALUES (?, ?, ?, ?, ?, ?, ?)
42
- ''', (gematria_sum, phrase_candidate, translation, book_id, title, chapter_id, verse_id))
43
  except sqlite3.IntegrityError:
44
- logging.info(f"Entry already exists: {gematria_sum}, {phrase_candidate}, {book_id}, {title}, {chapter_id}, {verse_id}")
45
-
46
- def populate_database(tanach_texts, max_phrase_length=1):
47
- conn = sqlite3.connect('gematria.db')
48
- c = conn.cursor()
49
- for book_id, text in enumerate(tanach_texts):
50
- if 'text' not in text or not isinstance(text['text'], list):
51
- logging.warning(f"Skipping book {book_id} due to missing or invalid 'text' field.")
52
- continue
53
- title = text.get('title', 'Unknown')
54
- chapters = text['text']
55
- for chapter_id, chapter in enumerate(chapters):
56
- if not isinstance(chapter, list):
57
- logging.warning(f"Skipping chapter {chapter_id} in book {title} due to invalid format.")
58
- continue
59
- for verse_id, verse in enumerate(chapter):
60
- verse_text = flatten_text(verse)
61
- verse_text = re.sub(r"[^\u05D0-\u05EA ]+", "", verse_text)
62
- verse_text = re.sub(r" +", " ", verse_text)
63
- words = verse_text.split()
64
- max_length = min(max_phrase_length, len(words))
65
- for length in range(1, max_length + 1):
66
- for start in range(len(words) - length + 1):
67
- phrase_candidate = " ".join(words[start:start + length])
68
- gematria_sum = calculate_gematria(phrase_candidate.replace(" ", ""))
69
- insert_phrase_to_db(c, gematria_sum, phrase_candidate, None, book_id + 1, title, chapter_id + 1, verse_id + 1) # No translation initially
70
  conn.commit()
71
- conn.close()
72
-
73
- def get_translation_from_db(c, phrase, book, chapter, verse):
 
 
 
 
 
 
 
 
 
 
 
 
74
  c.execute('''
75
- SELECT translation FROM results
76
- WHERE words = ? AND book = ? AND chapter = ? AND verse = ?
77
- ''', (phrase, book, chapter, verse))
78
- result = c.fetchone()
79
- return result[0] if result else None
80
-
81
- def translate_and_store(conn, phrase, book, chapter, verse):
82
- translator = GoogleTranslator(source='iw', target='en') # Explicitly set source to Hebrew
83
- c = conn.cursor()
84
- try:
85
- translation = translator.translate(phrase)
86
- logging.info(f"Translated phrase: {translation}")
87
- c.execute('''
88
- UPDATE results
89
- SET translation = ?
90
- WHERE words = ? AND book = ? AND chapter = ? AND verse = ?
91
- ''', (translation, phrase, book, chapter, verse))
92
- conn.commit()
93
- return translation
94
- except (exceptions.TranslationNotFound, exceptions.NotValidPayload,
95
- exceptions.ServerException, exceptions.RequestError) as e:
96
- logging.error(f"Error translating phrase '{phrase}': {e}")
97
- return "[Translation Error]"
 
 
 
 
 
 
 
98
 
99
  def gematria_search_interface(phrase):
100
- debug_output = []
101
-
102
- def debug_callback(message):
103
- debug_output.append(message)
104
- logging.info(message)
105
-
106
- if not phrase.strip():
107
- return "Please enter a phrase.", "\n".join(debug_output)
108
-
109
- phrase_gematria = calculate_gematria(phrase.replace(" ", ""))
110
- debug_callback(f"Debug: Gematria of the search phrase '{phrase}' is {phrase_gematria}")
111
-
112
- conn = sqlite3.connect('gematria.db')
113
- c = conn.cursor()
114
- matching_phrases = search_gematria_in_db(c, phrase_gematria)
115
-
116
- if not matching_phrases:
117
- conn.close()
118
- return "No matching phrases found.", "\n".join(debug_output)
119
-
120
- # Sort matching phrases by book, chapter, and verse
121
- matching_phrases.sort(key=lambda x: (x[1], x[3], x[4]))
122
-
123
- result = "Matching phrases:\n"
124
- for match in matching_phrases:
125
- if len(match) != 6: # Adjusted length for added translation
126
- debug_callback(f"Error: Expected tuple of length 6, but got {len(match)}: {match}")
127
- continue
128
- words, book, title, chapter, verse, translation = match
129
- if not translation: # Check if translation exists
130
- translation = translate_and_store(conn, words, book, chapter, verse)
131
- result += f"Book: {title} ({book})\nChapter: {chapter}, Verse: {verse}\nPhrase: {words}\nTranslation: {translation}\n\n"
132
-
133
- conn.close()
134
- return result, "\n".join(debug_output)
135
-
136
- def search_gematria_in_db(c, gematria_sum):
137
- c.execute('''
138
- SELECT words, book, title, chapter, verse, translation FROM results WHERE gematria_sum = ?
139
- ''', (gematria_sum,))
140
- results = c.fetchall()
141
- logging.info(f"Search results: {results}")
142
- return results
143
-
144
- def run_test():
145
- debug_output = []
146
- #test_phrase = "אחר ואתבנימין ואני"
147
- #expected_gematria = 1495
148
-
149
- def debug_callback(message):
150
- debug_output.append(message)
151
- logging.info(message)
152
-
153
- # Load the test JSON contents for 01.json
154
- #test_texts_00 = process_json_files(0, 0)
155
- #test_texts_01 = process_json_files(1, 1)
156
- #populate_database(test_texts_00, max_phrase_length=22) # Populate the database from book 0 with phrases up to 22 words
157
- #populate_database(test_texts_01, max_phrase_length=3) # Populate the database from book 1 with phrases up to 3 words
158
- #conn = sqlite3.connect('gematria.db')
159
- #c = conn.cursor()
160
- #matching_phrases_01 = search_gematria_in_db(c, expected_gematria)
161
- #conn.close()
162
- #assert matching_phrases_01[0][0] == test_phrase, f"Found phrase does not match: {matching_phrases_01[0][0]}"
163
- #print("Test successful: The phrase was correctly found and the gematria matches in 01.json.")
164
- #print("\n".join(debug_output))
165
-
166
- test_texts = process_json_files(1, 39)
167
- populate_database(test_texts, max_phrase_length=1)
168
- #populate_database(test_texts, max_phrase_length=2)
169
- #populate_database(test_texts, max_phrase_length=3)
170
-
171
- iface = gr.Interface(
172
  fn=gematria_search_interface,
173
  inputs=gr.Textbox(label="Enter phrase"),
174
- outputs=[gr.Textbox(label="Results"), gr.Textbox(label="Debug Output")],
175
  title="Gematria Search in Tanach",
176
- description="Search for phrases in Tanach that have the same gematria value as the entered phrase.",
177
- live=False, # Disable live update
178
- allow_flagging="never" # Disable flagging
179
- )
 
180
 
181
  if __name__ == "__main__":
182
- initialize_database()
183
- run_test() # Run tests
184
- iface.launch()
 
3
  import re
4
  import sqlite3
5
  import logging
6
+ from collections import defaultdict
7
  from util import process_json_files
8
  from gematria import calculate_gematria
9
  from deep_translator import GoogleTranslator, exceptions
10
 
11
+ # Set up logging
12
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
13
+
14
+ # Global variables for database connection and translator
15
+ conn = None
16
+ translator = None
17
 
18
  def flatten_text(text):
19
+ """Helper function to flatten nested lists into a single list."""
20
+ if isinstance(text, list):
21
+ return " ".join(flatten_text(item) if isinstance(item, list) else item for item in text)
22
+ return text
23
 
24
  def initialize_database():
25
+ """Initializes the SQLite database."""
26
+ global conn
27
+ conn = sqlite3.connect('gematria.db')
28
+ c = conn.cursor()
29
+ c.execute('''
30
+ CREATE TABLE IF NOT EXISTS results (
31
+ gematria_sum INTEGER,
32
+ words TEXT UNIQUE,
33
+ translation TEXT,
34
+ occurrences TEXT,
35
+ PRIMARY KEY (words)
36
+ )
37
+ ''')
38
+ c.execute('''
39
+ CREATE TABLE IF NOT EXISTS processed_books (
40
+ book INTEGER PRIMARY KEY,
41
+ max_phrase_length INTEGER
42
+ )
43
+ ''')
44
+ conn.commit()
45
+ logging.info("Database initialized.")
46
+
47
+ def initialize_translator():
48
+ """Initializes the Google Translator."""
49
+ global translator
50
+ translator = GoogleTranslator(source='iw', target='en')
51
+ logging.info("Translator initialized.")
52
+
53
+ def insert_phrase_to_db(gematria_sum, phrase_candidate, translation, occurrence):
54
+ """Inserts a phrase and its Gematria value into the database."""
55
+ global conn
56
+ c = conn.cursor()
57
+ try:
58
  c.execute('''
59
+ INSERT INTO results (gematria_sum, words, translation, occurrences)
60
+ VALUES (?, ?, ?, ?)
61
+ ''', (gematria_sum, phrase_candidate, translation, occurrence))
62
+ conn.commit()
63
+ logging.debug(f"Inserted phrase: {phrase_candidate} (Gematria: {gematria_sum})")
64
+ except sqlite3.IntegrityError:
65
+ logging.debug(f"Phrase already exists: {phrase_candidate} (Gematria: {gematria_sum})")
66
+ c.execute('''
67
+ SELECT occurrences FROM results WHERE words = ?
68
+ ''', (phrase_candidate,))
69
+ existing_occurrences = c.fetchone()[0]
70
+ updated_occurrences = existing_occurrences + ';' + occurrence
71
+ c.execute('''
72
+ UPDATE results
73
+ SET occurrences = ?
74
+ WHERE words = ?
75
+ ''', (updated_occurrences, phrase_candidate))
76
  conn.commit()
 
77
 
78
+ def populate_database(tanach_texts, max_phrase_length=3):
79
+ """Populates the database with phrases from the Tanach and their Gematria values."""
80
+ global conn
81
+ logging.info("Populating database...")
82
+ c = conn.cursor()
83
+ for book_id, text in enumerate(tanach_texts):
84
+ c.execute('''SELECT max_phrase_length FROM processed_books WHERE book = ?''', (book_id + 1,))
85
+ result = c.fetchone()
86
+ if result and result[0] >= max_phrase_length:
87
+ logging.info(f"Skipping book {book_id+1}: Already processed with max_phrase_length {result[0]}")
88
+ continue
89
+
90
+ logging.info(f"Processing book {book_id+1} with max_phrase_length {max_phrase_length}")
91
+ if 'text' not in text or not isinstance(text['text'], list):
92
+ logging.warning(f"Skipping book {book_id} due to missing or invalid 'text' field.")
93
+ continue
94
+ title = text.get('title', 'Unknown')
95
+ chapters = text['text']
96
+ for chapter_id, chapter in enumerate(chapters):
97
+ if not isinstance(chapter, list):
98
+ logging.warning(f"Skipping chapter {chapter_id} in book {title} due to invalid format.")
99
+ continue
100
+ for verse_id, verse in enumerate(chapter):
101
+ verse_text = flatten_text(verse)
102
+ verse_text = re.sub(r"[^\u05D0-\u05EA ]+", "", verse_text)
103
+ verse_text = re.sub(r" +", " ", verse_text)
104
+ words = verse_text.split()
105
+ for length in range(1, max_phrase_length + 1):
106
+ for start in range(len(words) - length + 1):
107
+ phrase_candidate = " ".join(words[start:start + length])
108
+ gematria_sum = calculate_gematria(phrase_candidate.replace(" ", ""))
109
+ occurrence = f"{book_id+1}:{title}:{chapter_id+1}:{verse_id+1}"
110
+ insert_phrase_to_db(gematria_sum, phrase_candidate, None, occurrence) # No translation initially
111
  try:
112
+ c.execute('''INSERT INTO processed_books (book, max_phrase_length) VALUES (?, ?)''', (book_id + 1, max_phrase_length))
 
 
 
 
113
  except sqlite3.IntegrityError:
114
+ c.execute('''UPDATE processed_books SET max_phrase_length = ? WHERE book = ?''', (max_phrase_length, book_id + 1))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  conn.commit()
116
+ logging.info("Database population complete.")
117
+
118
+ def get_translation(phrase):
119
+ """Retrieves or generates the English translation of a Hebrew phrase."""
120
+ global translator, conn
121
+ c = conn.cursor()
122
+ c.execute('''
123
+ SELECT translation FROM results
124
+ WHERE words = ?
125
+ ''', (phrase,))
126
+ result = c.fetchone()
127
+ if result and result[0]:
128
+ return result[0]
129
+ else:
130
+ translation = translate_and_store(phrase)
131
  c.execute('''
132
+ UPDATE results
133
+ SET translation = ?
134
+ WHERE words = ?
135
+ ''', (translation, phrase))
136
+ conn.commit()
137
+ return translation
138
+
139
+
140
+ def translate_and_store(phrase):
141
+ """Translates a phrase using Google Translate."""
142
+ global translator
143
+ try:
144
+ translation = translator.translate(phrase)
145
+ logging.debug(f"Translated phrase: {translation}")
146
+ return translation
147
+ except (exceptions.TranslationNotFound, exceptions.NotValidPayload,
148
+ exceptions.ServerException, exceptions.RequestError) as e:
149
+ logging.error(f"Error translating phrase '{phrase}': {e}")
150
+ return "[Translation Error]"
151
+
152
+ def search_gematria_in_db(gematria_sum):
153
+ """Searches the database for phrases with a given Gematria value."""
154
+ global conn
155
+ c = conn.cursor()
156
+ c.execute('''
157
+ SELECT words, occurrences FROM results WHERE gematria_sum = ?
158
+ ''', (gematria_sum,))
159
+ results = c.fetchall()
160
+ logging.debug(f"Found {len(results)} matching phrases for Gematria: {gematria_sum}")
161
+ return results
162
 
163
  def gematria_search_interface(phrase):
164
+ """The main function for the Gradio interface."""
165
+ if not phrase.strip():
166
+ return "Please enter a phrase."
167
+
168
+ # Datenbankverbindung innerhalb der Funktion erstellen
169
+ global conn
170
+ conn = sqlite3.connect('gematria.db')
171
+ c = conn.cursor()
172
+
173
+ phrase_gematria = calculate_gematria(phrase.replace(" ", ""))
174
+ logging.info(f"Searching for phrases with Gematria: {phrase_gematria}")
175
+
176
+ matching_phrases = search_gematria_in_db(phrase_gematria)
177
+ if not matching_phrases:
178
+ return "No matching phrases found."
179
+
180
+ # Format results for display
181
+ results = []
182
+ for words, occurrences in matching_phrases:
183
+ translation = get_translation(words)
184
+ for occurrence in occurrences.split(';'):
185
+ book, title, chapter, verse = occurrence.split(':')
186
+ results.append(f"Book: {title} ({book})\nChapter: {chapter}, Verse: {verse}\nPhrase: {words}\nTranslation: {translation}\n\n")
187
+
188
+ conn.close()
189
+ return "\n".join(results)
190
+
191
+ def run_app():
192
+ """Initializes and launches the Gradio app."""
193
+ initialize_database()
194
+ initialize_translator()
195
+
196
+ # Pre-populate the database
197
+ tanach_texts = process_json_files(1, 39)
198
+ populate_database(tanach_texts)
199
+
200
+ iface = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  fn=gematria_search_interface,
202
  inputs=gr.Textbox(label="Enter phrase"),
203
+ outputs=gr.Textbox(label="Results"),
204
  title="Gematria Search in Tanach",
205
+ description="Search for phrases in the Tanach that have the same Gematria value.",
206
+ live=False,
207
+ allow_flagging="never"
208
+ )
209
+ iface.launch()
210
 
211
  if __name__ == "__main__":
212
+ run_app()
 
 
gematria.db CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a70de31a4b4d75fe746acc35d7567274615330a7e98872ee123f9cff3a51b38c
3
- size 44244992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fcf5c1564217654d09ee0f57dc0025114c5df51a352f829932e86a10570ce09
3
+ size 47960064