File size: 13,130 Bytes
9be760a
 
 
84b09e2
a8a8bc2
6c0aa26
96b58ba
 
9be760a
 
7445a27
e8bf2aa
96b58ba
 
 
 
396d07b
96b58ba
9be760a
6c0aa26
c1f45eb
6c0aa26
96b58ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d07d4c9
 
96b58ba
 
 
 
 
 
 
c1f45eb
96b58ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9983d3c
96b58ba
 
 
 
 
 
 
 
 
 
 
 
 
9983d3c
96b58ba
 
 
 
 
 
 
 
 
9be760a
 
96b58ba
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
import gradio as gr
import json
import re
import sqlite3
import logging
from collections import defaultdict
from typing import Tuple, Dict, List

from util import process_json_files
from gematria import calculate_gematria
from deep_translator import GoogleTranslator, exceptions
from urllib.parse import quote_plus
from tqdm import tqdm # Import tqdm for progress bars

# Constants
DATABASE_FILE = 'gematria.db'  # SQLite file created in the working directory
MAX_PHRASE_LENGTH_LIMIT = 20 # Populate database for phrases up to 20 words
BATCH_SIZE = 1000 # Insert phrases into database in batches

# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s')

# Global variables (populated by the initialize_* functions / populate_database)
conn: sqlite3.Connection = None          # shared SQLite connection
translator: GoogleTranslator = None      # Hebrew -> English translator
book_names: Dict[int, str] = {}          # book_id -> book title, filled during population
gematria_cache: Dict[Tuple[int, int], List[Tuple[str, str, int, int]]] = {}  # (gematria, max_words) -> results
translation_cache: Dict[str, str] = {}   # hebrew phrase -> English translation

def initialize_database() -> None:
  """Opens the global SQLite connection and creates the schema if missing."""
  global conn
  # Autocommit mode (isolation_level=None) avoids per-statement transaction
  # overhead during the bulk population phase.
  conn = sqlite3.connect(DATABASE_FILE, isolation_level=None)
  cursor = conn.cursor()

  # Schema: phrase results, per-book progress markers, and a translation cache.
  schema = (
    '''
  CREATE TABLE IF NOT EXISTS results (
    gematria_sum INTEGER,
    words TEXT,
    translation TEXT,
    book TEXT,
    chapter INTEGER,
    verse INTEGER,
    PRIMARY KEY (gematria_sum, words, book, chapter, verse)
  )
  ''',
    '''
  CREATE TABLE IF NOT EXISTS processed_books (
    book TEXT PRIMARY KEY,
    max_phrase_length INTEGER
  )
  ''',
    '''
  CREATE TABLE IF NOT EXISTS translations (
    hebrew_phrase TEXT PRIMARY KEY,
    english_translation TEXT
  )
  ''',
  )
  for ddl in schema:
    cursor.execute(ddl)

def initialize_translator() -> None:
  """Creates the module-wide GoogleTranslator used for phrase translation."""
  global translator
  # 'iw' is the legacy ISO 639 code Google's API uses for Hebrew.
  translator = GoogleTranslator(source='iw', target='en')
  logging.info("Translator initialized.")

def populate_database(start_book: int, end_book: int, max_phrase_length: int = 1) -> "Iterator[Tuple[int, str, str, int, int]]":
  """Yields Gematria phrase tuples for a range of Tanach books.

  NOTE(review): despite the name, this function is a GENERATOR — it does not
  insert phrases itself.  It yields (gematria_sum, phrase, book_title,
  chapter, verse) tuples for the caller to batch-insert (see run_app /
  insert_phrases_to_db).  Nothing below runs until the first item is
  consumed.  Books already recorded in processed_books with an equal or
  larger max_phrase_length are skipped entirely.
  """
  global conn, book_names
  logging.info(f"Populating database with books from {start_book} to {end_book}...")
  cursor = conn.cursor()

  for book_id in tqdm(range(start_book, end_book + 1), desc="Processing Books"):
    book_data = process_json_files(book_id, book_id) # Get data for the single book

    # process_json_files returns a dictionary with book_id as key,
    # so access the book data directly
    if book_id in book_data:
      # Rebind book_data from the outer dict to this book's record.
      book_data = book_data[book_id]
      if 'title' not in book_data or not isinstance(book_data['title'], str):
        logging.warning(f"Skipping book {book_id} due to missing or invalid 'title' field.")
        continue

      title = book_data['title']
      # Side effect: record the id->title mapping used later for sorting
      # results in gematria_search_interface.
      book_names[book_id] = title

      # Check if the book is already processed for this max_phrase_length
      cursor.execute('''SELECT max_phrase_length FROM processed_books WHERE book = ?''', (title,))
      result = cursor.fetchone()
      if result and result[0] >= max_phrase_length:
        logging.info(f"Skipping book {title}: Already processed with max_phrase_length {result[0]}")
        continue

      logging.info(f"Processing book {title} with max_phrase_length {max_phrase_length}")

      if 'text' not in book_data or not isinstance(book_data['text'], list):
        logging.warning(f"Skipping book {book_id} due to missing or invalid 'text' field.")
        continue

      # 'text' is assumed to be a list of chapters, each a list of verses —
      # TODO confirm against process_json_files output.
      chapters = book_data['text']
      # Faster iteration with enumerate and list comprehension
      for chapter_id, chapter in enumerate(chapters):
        for verse_id, verse in enumerate(chapter):
          verse_text = flatten_text(verse)
          # Remove text in square brackets and non-Hebrew characters
          verse_text = re.sub(r'\[.*?\]', '', verse_text)
          # Keep only Hebrew letters (U+05D0 aleph .. U+05EA tav) and spaces.
          verse_text = re.sub(r"[^\u05D0-\u05EA ]+", "", verse_text)
          verse_text = re.sub(r" +", " ", verse_text)
          words = verse_text.split()

          # Use a generator to avoid building large lists in memory
          # Every n-gram of length 1..max_phrase_length is yielded, with
          # chapter/verse converted to 1-based numbering.
          for length in range(1, max_phrase_length + 1):
            for start in range(len(words) - length + 1):
              phrase_candidate = " ".join(words[start:start + length])
              gematria_sum = calculate_gematria(phrase_candidate.replace(" ", ""))
              yield gematria_sum, phrase_candidate, title, chapter_id + 1, verse_id + 1

      # Mark the book as processed with the current max_phrase_length
      # NOTE(review): this runs when the generator finishes the book, i.e.
      # before the caller has necessarily flushed the yielded phrases — if
      # consumption stops early, the book is marked done prematurely.
      cursor.execute('''
      INSERT OR REPLACE INTO processed_books (book, max_phrase_length)
      VALUES (?, ?)
      ''', (title, max_phrase_length))

def insert_phrases_to_db(phrases: List[Tuple[int, str, str, int, int]]) -> None:
  """Writes a batch of phrase tuples to the results table in one round trip.

  Rows whose primary key already exists are silently skipped
  (INSERT OR IGNORE).
  """
  global conn
  db_cursor = conn.cursor()
  # One executemany call plus a single commit per batch is far cheaper than
  # inserting and committing row by row.
  db_cursor.executemany('''
  INSERT OR IGNORE INTO results (gematria_sum, words, book, chapter, verse)
  VALUES (?, ?, ?, ?, ?)
  ''', phrases)
  conn.commit()

def get_translation(phrase: str) -> str:
  """Returns the English translation of a Hebrew phrase.

  Lookup order: in-memory cache -> translations table -> Google Translate.
  Freshly generated translations are persisted to the translations table.

  Bug fixes vs. the original:
  - translation_cache was checked but never written, so the in-memory cache
    never took effect; all paths now populate it.
  - the INSERT of a fresh translation was never committed; the search path
    opens its connection without autocommit, so the row could be lost when
    that connection was closed.  An explicit commit is a no-op on the
    autocommit connection used during population.
  """
  global translator, conn, translation_cache
  if phrase in translation_cache:
    return translation_cache[phrase]

  cursor = conn.cursor()
  cursor.execute('''
  SELECT english_translation FROM translations
  WHERE hebrew_phrase = ?
  ''', (phrase,))
  result = cursor.fetchone()
  if result and result[0]:
    translation = result[0]
  else:
    translation = translate_and_store(phrase)
    cursor.execute('''
    INSERT OR IGNORE INTO translations (hebrew_phrase, english_translation)
    VALUES (?, ?)
    ''', (phrase, translation))
    conn.commit()  # persist: the UI connection is NOT in autocommit mode
  translation_cache[phrase] = translation  # fix: actually populate the cache
  return translation

def translate_and_store(phrase: str) -> str:
  """Translates a Hebrew phrase to English, retrying up to 3 times.

  Returns "[Translation Error]" if every attempt fails.

  Bug fix: the original except clause named requests.exceptions.ConnectionError,
  but `requests` is never imported in this module — the first translation
  failure therefore raised NameError instead of retrying.  OSError is caught
  instead: requests' exceptions derive from IOError (an alias of OSError),
  so network-level failures are still retried without adding an import.
  """
  global translator
  max_retries = 3

  for attempt in range(1, max_retries + 1):
    try:
      translation = translator.translate(phrase)
      logging.debug(f"Translated phrase: {translation}")
      return translation
    except (exceptions.TranslationNotFound, exceptions.NotValidPayload,
         exceptions.ServerException, exceptions.RequestError, OSError) as e:
      logging.warning(f"Error translating phrase '{phrase}': {e}. Retrying... ({attempt}/{max_retries})")

  logging.error(f"Failed to translate phrase '{phrase}' after {max_retries} retries.")
  return "[Translation Error]"

def search_gematria_in_db(gematria_sum: int, max_words: int) -> List[Tuple[str, str, int, int]]:
  """Fetches all phrases whose Gematria equals gematria_sum, keeping only
    those containing at most max_words words."""
  global conn
  cursor = conn.cursor()
  logging.debug(f"Searching for phrases with Gematria: {gematria_sum} and max words: {max_words}")
  # Word count is not stored as a column, so every candidate row is pulled
  # first and the length filter is applied in Python.
  cursor.execute('''
  SELECT words, book, chapter, verse FROM results WHERE gematria_sum = ?
  ''', (gematria_sum,))
  results = cursor.fetchall()
  logging.debug(f"Found {len(results)} matching phrases before filtering.")
  filtered_results = []
  for words, book, chapter, verse in results:
    word_count = len(words.split())
    logging.debug(f"Word count for '{words}': {word_count}")
    if word_count > max_words:
      continue  # phrase is longer than the caller asked for
    filtered_results.append((words, book, chapter, verse))
  logging.debug(f"Found {len(filtered_results)} matching phrases after filtering.")
  return filtered_results

def gematria_search_interface(phrase: str, max_words: int, show_translation: bool) -> str:
  """Gradio handler: renders HTML listing Tanach phrases whose Gematria
  matches the input.

  The input may mix Hebrew text and digit runs; each digit run is added
  numerically to the Gematria of the remaining text.  Matches are filtered
  to at most max_words words, grouped by book, and optionally translated.
  Returns an HTML fragment (with inline CSS) or a plain message string.
  """
  if not phrase.strip():
    return "Please enter a phrase."

  global conn, book_names, gematria_cache
  # NOTE(review): unlike initialize_database(), this reconnects WITHOUT
  # isolation_level=None, so writes made through this connection (e.g. by
  # get_translation) sit in an implicit transaction until committed.
  conn = sqlite3.connect(DATABASE_FILE)
  cursor = conn.cursor()

  # Extract numbers from the input text
  numbers = re.findall(r'\d+', phrase)
  # Calculate Gematria for the remaining text (non-numbers)
  text_without_numbers = re.sub(r'\d+', '', phrase)
  phrase_gematria = calculate_gematria(text_without_numbers.replace(" ", ""))

  # Add sum of numbers to Gematria
  phrase_gematria += sum(int(number) for number in numbers)

  logging.info(f"Searching for phrases with Gematria: {phrase_gematria}")

  # Debugging output
  logging.debug(f"Phrase Gematria: {phrase_gematria}")
  logging.debug(f"Max Words: {max_words}")

  # Check if Gematria is in cache for the specific max_words value
  if (phrase_gematria, max_words) in gematria_cache:
    matching_phrases = gematria_cache[(phrase_gematria, max_words)]
    logging.debug(f"Retrieved matching phrases from cache for max_words: {max_words}.")
  else:
    # Search in the database
    matching_phrases = search_gematria_in_db(phrase_gematria, max_words)
    # Cache the results with the max_words value
    gematria_cache[(phrase_gematria, max_words)] = matching_phrases
    logging.debug(f"Retrieved matching phrases from database for max_words: {max_words}.")

  if not matching_phrases:
    return "No matching phrases found."

  # Sort results by book, chapter, and verse.
  # NOTE(review): the key reverse-maps book title -> book id through
  # book_names, which is only populated by populate_database in THIS
  # process; a title missing from that dict raises ValueError here.
  sorted_phrases = sorted(matching_phrases, key=lambda x: (int(list(book_names.keys())[list(book_names.values()).index(x[1])]), x[2], x[3]))
  logging.debug(f"Sorted matching phrases: {sorted_phrases}")

  # Group results by book
  results_by_book = defaultdict(list)
  for words, book, chapter, verse in sorted_phrases:
    results_by_book[book].append((words, chapter, verse))
  logging.debug(f"Grouped results by book: {results_by_book}")

  # Format results for display: one <h4> section per book, one card per hit.
  results = []
  results.append("<div class='results-container'>")
  for book, phrases in results_by_book.items():
    results.append(f"<h4>Book: {book}</h4>") # Directly display book name
    for words, chapter, verse in phrases:
      # Translation is fetched lazily per phrase only when requested.
      translation = get_translation(words) if show_translation else ""
      link = f"https://www.biblegateway.com/passage/?search={quote_plus(book)}+{chapter}%3A{verse}&version=CJB"
      results.append(f"""
      <div class='result-item'>
        <p>Chapter: {chapter}, Verse: {verse}</p>
        <p class='hebrew-phrase'>Hebrew Phrase: {words}</p>
        <p>Translation: {translation}</p>
        <a href='{link}' target='_blank' class='bible-link'>[See on Bible Gateway]</a>
      </div>
      """)
  results.append("</div>") # Close results-container div

  conn.close()

  # Add CSS styling (inline so the Gradio HTML component is self-contained)
  style = """
  <style>
    .results-container {
      display: grid;
      grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
      gap: 20px;
    }

    .result-item {
      border: 1px solid #ccc;
      padding: 15px;
      border-radius: 5px;
      box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1);
    }

    .hebrew-phrase {
      font-family: 'SBL Hebrew', 'Ezra SIL', serif;
      direction: rtl;
    }

    .bible-link {
      display: block;
      margin-top: 10px;
      color: #007bff;
      text-decoration: none;
    }
  </style>
  """

  return style + "\n".join(results)

def flatten_text(text: List) -> str:
  """Recursively joins arbitrarily nested lists of strings into one
    space-separated string; a non-list input is returned unchanged."""
  if not isinstance(text, list):
    return text
  parts = []
  for item in text:
    # Recurse into sublists; leaves are assumed to already be strings.
    parts.append(flatten_text(item) if isinstance(item, list) else item)
  return " ".join(parts)

def run_app() -> None:
  """Initializes the database and translator, populates the phrase table,
  and launches the Gradio search UI.

  Population runs one pass per phrase length from 1 to
  MAX_PHRASE_LENGTH_LIMIT; each pass streams tuples from the
  populate_database generator and flushes them to SQLite in BATCH_SIZE
  chunks via insert_phrases_to_db.

  Bug fix: the batch list is now reset at the start of each pass.  The
  original left the final partial batch in the list after flushing it, so
  every subsequent pass re-submitted those rows (harmless only because the
  insert uses INSERT OR IGNORE, but wasted work).
  """
  initialize_database()
  initialize_translator()

  # Pre-populate the database
  logging.info("Starting database population...")
  for max_phrase_length in range(1, MAX_PHRASE_LENGTH_LIMIT + 1):
    phrases_to_insert = []  # fresh batch buffer per pass (see docstring)
    for phrase_tuple in tqdm(populate_database(1, 39, max_phrase_length=max_phrase_length),
                 desc=f"Populating Database (Max Length: {max_phrase_length})"): # Books 1 to 39
      phrases_to_insert.append(phrase_tuple)
      if len(phrases_to_insert) >= BATCH_SIZE: # Flush in batches for efficiency
        insert_phrases_to_db(phrases_to_insert)
        phrases_to_insert = []
    if phrases_to_insert: # Flush the final partial batch of this pass
      insert_phrases_to_db(phrases_to_insert)
  logging.info("Database population complete.")

  iface = gr.Interface(
    fn=gematria_search_interface,
    inputs=[
      gr.Textbox(label="Enter word(s) or numbers (e.g., 'abc', '888' or 'abc 111 777')"),
      gr.Number(label="Max Word Count in Result Phrases", value=1, minimum=1, maximum=MAX_PHRASE_LENGTH_LIMIT),
      gr.Checkbox(label="Show Translation", value=True)
    ],
    outputs=gr.HTML(label="Results"),
    title="Gematria Search in Tanach",
    description="Search for phrases and/or numbers in the Tanach that have the same Gematria value.",
    live=False,
    allow_flagging="never"
  )
  iface.launch()

if __name__ == "__main__":
  run_app()