import logging
import sqlite3
import threading
import time
from contextlib import closing
from queue import Queue
from typing import Optional

from deep_translator import GoogleTranslator, exceptions
from tqdm import tqdm


# --- Configuration ---------------------------------------------------------
DATABASE_FILE = 'gematria.db'  # SQLite database that is read and updated
BATCH_SIZE = 1000              # not referenced by the code visible in this file
NUM_THREADS = 10               # number of translation worker threads to start

# --- Logging ---------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# --- Translator ------------------------------------------------------------
# A single translator instance used by every worker thread.
# NOTE(review): 'yi' is the language code for Yiddish, not Hebrew -- confirm
# that this is the intended source language.
translator = GoogleTranslator(source='yi', target='en')
logging.info("Translator initialized.")

# --- Work queue and progress bars ------------------------------------------
# Phrases waiting to be translated; ``None`` is used as a stop marker.
translation_queue = Queue()
# Bar totals start at 0 and are filled in once the amount of work is known.
translation_queue_tqdm = tqdm(total=0, dynamic_ncols=True, desc="Translation Queue")
total_translations_tqdm = tqdm(total=0, dynamic_ncols=True, desc="Total Translations")

# --- Shared state ----------------------------------------------------------
db_lock = threading.Lock()   # serialises database access between threads
translations_completed = 0   # count of translations written to the database
def translate_and_store(phrase: str) -> Optional[str]:
    """Translate ``phrase`` to English using the module-level translator.

    Despite the name, this function only translates; writing the result to
    the database is left to the caller.

    Up to three attempts are made. Between failed attempts the function
    sleeps briefly (1s, then 2s) so that transient server or rate-limit
    errors are not retried immediately.

    Args:
        phrase: The source-language text to translate.

    Returns:
        The translated text, or ``None`` if every attempt failed.
    """
    # NOTE(review): the translator object is shared by all worker threads;
    # confirm that deep_translator's translate() is safe to call concurrently.
    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            return translator.translate(phrase)
        except (exceptions.TranslationNotFound, exceptions.NotValidPayload,
                exceptions.ServerException, exceptions.RequestError,
                exceptions.TooManyRequests) as e:
            logging.warning(f"Error translating phrase '{phrase}': {e}. Retrying... ({attempt}/{max_retries})")
            if attempt < max_retries:
                # Exponential backoff before the next attempt.
                time.sleep(2 ** (attempt - 1))

    logging.error(f"Failed to translate phrase '{phrase}' after {max_retries} retries.")
    return None
def translation_worker():
    """Consume phrases from ``translation_queue`` until a ``None`` sentinel.

    For each phrase the worker requests a translation and, if one was
    obtained, writes it to the ``results`` table (all rows whose ``words``
    column equals the phrase), bumps ``translations_completed`` and advances
    the "Total Translations" progress bar.

    ``task_done()`` is called for every item taken from the queue, including
    the sentinel and items whose processing raised, so that a caller blocked
    in ``translation_queue.join()`` cannot hang on a failed item.
    """
    global translations_completed

    while True:
        phrase = translation_queue.get()
        try:
            if phrase is None:
                # Stop marker: leave the loop (the ``finally`` below still runs).
                break

            # The queue progress bar is advanced by the producer as it
            # enqueues phrases, so it is not touched again here.
            translation = translate_and_store(phrase)
            if translation is None:
                # Nothing to store; do not open a database connection at all.
                continue

            with db_lock:
                # ``closing`` actually closes the connection; using the
                # connection itself as a context manager only commits or
                # rolls back the transaction.
                with closing(sqlite3.connect(DATABASE_FILE)) as conn:
                    with conn:
                        conn.execute(
                            "UPDATE results SET translation = ? WHERE words = ?",
                            (translation, phrase),
                        )
                translations_completed += 1
                total_translations_tqdm.update()
        except Exception:
            # Worker boundary: log and keep the thread alive so the remaining
            # queue items are still processed.
            logging.exception("Unexpected error while processing phrase %r", phrase)
        finally:
            translation_queue.task_done()
def populate_translations():
    """Translate every distinct untranslated phrase in the ``results`` table.

    Steps:
      1. Count and read the distinct ``words`` values whose ``translation``
         is NULL, putting each one on ``translation_queue``.
      2. Start ``NUM_THREADS`` worker threads running ``translation_worker``.
      3. Wait until the queue has been fully processed, then send one
         ``None`` sentinel per worker and wait for the threads to finish.
    """
    # ``closing`` makes sure the read connection is released before the
    # workers start opening their own connections for updates.
    with closing(sqlite3.connect(DATABASE_FILE)) as conn:
        cursor = conn.cursor()

        cursor.execute("SELECT COUNT(DISTINCT words) FROM results WHERE translation IS NULL")
        total_phrases = cursor.fetchone()[0]
        logging.info(f"Found {total_phrases} distinct phrases to translate.")

        # The bars were created with total=0; set the real totals now.
        translation_queue_tqdm.total = total_phrases
        total_translations_tqdm.total = total_phrases

        cursor.execute("SELECT DISTINCT words FROM results WHERE translation IS NULL")
        for (phrase,) in cursor:
            if phrase is None:
                # ``None`` is the workers' stop marker; a NULL ``words`` value
                # must not be enqueued or it would shut a worker down early.
                continue
            translation_queue.put(phrase)
            translation_queue_tqdm.update()

    translation_queue_tqdm.close()

    threads = [threading.Thread(target=translation_worker) for _ in range(NUM_THREADS)]
    for thread in threads:
        thread.start()

    # Block until every queued phrase has been marked done by a worker.
    translation_queue.join()

    # One sentinel per worker so each of them leaves its loop.
    for _ in threads:
        translation_queue.put(None)
    for thread in threads:
        thread.join()

    logging.info("All translations completed.")
def save_translations_periodically():
    """Every 60 seconds, open the database under ``db_lock`` and commit.

    Runs forever; it is meant to be the target of a background thread.

    Each cycle opens a fresh connection, commits it and closes it again. No
    statements are executed on that connection inside this function.
    NOTE(review): because the connection is new on every cycle, the commit
    has nothing of its own to flush -- confirm whether this periodic save is
    still needed.
    """
    while True:
        time.sleep(60)
        logging.info("Saving translations to the database...")
        with db_lock:
            # ``closing`` closes the connection after the commit; the
            # connection's own context manager would leave it open, leaking
            # one connection per cycle.
            with closing(sqlite3.connect(DATABASE_FILE)) as conn:
                conn.commit()
        logging.info("Translations saved.")
if __name__ == "__main__":
    # Run the translation pipeline in its own thread.
    translation_thread = threading.Thread(target=populate_translations)
    translation_thread.start()

    # The periodic saver loops forever, so it is started as a daemon thread:
    # it must not keep the process alive once the translations are finished.
    save_thread = threading.Thread(target=save_translations_periodically, daemon=True)
    save_thread.start()

    # Wait for the translation run to finish, then let the script exit
    # instead of sleeping in an endless loop.
    translation_thread.join()