import hashlib
import json
import logging
import os
import sqlite3
from typing import Any, Dict, Optional

logger = logging.getLogger(__name__)


def process_quran_files(start: int, end: int) -> Dict[int, Dict[str, Any]]:
    """
    Processes Quran JSON files and returns a dictionary mapping sura IDs to their data.

    Args:
        start: The starting sura ID (inclusive).
        end: The ending sura ID (inclusive).

    Returns:
        A dictionary where keys are sura IDs and values are dictionaries
        containing 'name' and 'text' fields.
    """
    base_path = "texts/quran"
    results = {}

    def verse_number(key: str) -> int:
        # Verse keys are assumed to embed the verse number (e.g. "verse_1");
        # sorting by it avoids a plain string sort putting "verse_10" before
        # "verse_2".
        digits = ''.join(c for c in key if c.isdigit())
        return int(digits) if digits else 0

    for i in range(start, end + 1):
        file_name = f"{base_path}/{i:03d}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if data:
                    # Extract the sura name and its verses.
                    name = data.get("name", "No title")
                    verses = data.get("verse", {})
                    text = [verses[key] for key in sorted(verses.keys(), key=verse_number)]
                    # Store the sura ID as key and the sura data as value.
                    results[i] = {"name": name, "text": text}
        except FileNotFoundError:
            logger.warning(f"File {file_name} not found.")
        except json.JSONDecodeError as e:
            logger.warning(f"File {file_name} could not be read as JSON: {e}")
        except KeyError as e:
            logger.warning(f"Expected key 'verse' is missing in {file_name}: {e}")
    return results
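
# Usage sketch (hypothetical values; assumes sura files such as
# texts/quran/001.json exist in the {"name": ..., "verse": {...}} layout
# read above):
#
#     suras = process_quran_files(1, 3)
#     for sura_id, info in sorted(suras.items()):
#         print(f"{sura_id:03d} {info['name']}: {len(info['text'])} verses")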


def find_shortest_sura_match(gematria_sum: int, db_file: str = 'abjad.db') -> Optional[Dict[str, Any]]:
    """
    Finds the shortest Quran phrase in the database that matches a gematria sum.

    Falls back to the phrase with the closest gematria value when no exact
    match exists.

    Args:
        gematria_sum: The gematria value to search for.
        db_file: The database file to search in.

    Returns:
        A dictionary containing the matched phrase information, or None if no
        match is found.
    """
    logger.debug(f"Entering find_shortest_sura_match with gematria_sum: {gematria_sum}")
    with sqlite3.connect(db_file) as conn:
        cursor = conn.cursor()
        # First check whether there are any Quran entries in the database
        # (the results table also holds Psalms rows, which are filtered out).
        cursor.execute('''
            SELECT COUNT(*)
            FROM results
            WHERE book != 'Psalms'
        ''')
        count = cursor.fetchone()[0]
        if count == 0:
            logger.warning("No Quran entries found in database. Run initialize_quran_db.py first.")
            return None
        # Search for an exact match, prioritizing shorter phrases.
        cursor.execute('''
            SELECT words, book, chapter, verse, phrase_length, word_position
            FROM results
            WHERE gematria_sum = ? AND book != 'Psalms'
            ORDER BY phrase_length ASC, LENGTH(words) ASC
            LIMIT 1
        ''', (gematria_sum,))
        result = cursor.fetchone()
        if result:
            logger.debug(f"Shortest sura match found: {result}")
            return {
                "words": result[0],
                "book": result[1],
                "chapter": result[2],
                "verse": result[3],
                "phrase_length": result[4],
                "word_position": result[5]
            }
        # If there is no exact match, fall back to the closest gematria value.
        # This mirrors how quran_network handles it.
        cursor.execute('''
            SELECT gematria_sum, ABS(gematria_sum - ?) AS diff
            FROM results
            WHERE book != 'Psalms'
            GROUP BY gematria_sum
            ORDER BY diff ASC
            LIMIT 1
        ''', (gematria_sum,))
        closest = cursor.fetchone()
        if closest:
            closest_gematria = closest[0]
            logger.debug(f"No exact match found. Closest gematria: {closest_gematria}")
            # Find the shortest phrase with this gematria value.
            cursor.execute('''
                SELECT words, book, chapter, verse, phrase_length, word_position
                FROM results
                WHERE gematria_sum = ? AND book != 'Psalms'
                ORDER BY phrase_length ASC, LENGTH(words) ASC
                LIMIT 1
            ''', (closest_gematria,))
            result = cursor.fetchone()
            if result:
                logger.debug(f"Closest sura match found: {result}")
                return {
                    "words": result[0],
                    "book": result[1],
                    "chapter": result[2],
                    "verse": result[3],
                    "phrase_length": result[4],
                    "word_position": result[5]
                }
    logger.debug("No matching sura found.")
    return None
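
# Usage sketch (hypothetical value; assumes abjad.db has been populated,
# e.g. via initialize_quran_database below):
#
#     match = find_shortest_sura_match(786)
#     if match:
#         print(match["words"], match["book"], match["chapter"], match["verse"])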


def create_quran_display_iframe(sura_name: str, chapter: int, verse: int) -> str:
    """Creates an iframe HTML string for displaying a Quran verse."""
    logger.debug(f"Creating Quran display iframe for sura: {sura_name}, chapter: {chapter}, verse: {verse}")
    # Use the surahquran.com URL format.
    url = f"https://surahquran.com/aya-{verse}-sora-{chapter}.html"
    iframe = f'<iframe src="{url}" width="800" height="600"></iframe>'
    logger.debug(f"Generated iframe: {iframe}")
    return iframe
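
# Usage sketch:
#
#     html = create_quran_display_iframe("Al-Fatiha", chapter=1, verse=1)
#     # -> '<iframe src="https://surahquran.com/aya-1-sora-1.html" ...></iframe>'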


def get_sura_count() -> int:
    """Returns the number of sura JSON files found in texts/quran, falling
    back to 114 (the canonical number of suras) if the directory is missing."""
    base_path = "texts/quran"
    # Count the number of JSON files in the quran directory.
    try:
        files = [f for f in os.listdir(base_path) if f.endswith('.json')]
        return len(files)
    except FileNotFoundError:
        logger.error(f"Directory {base_path} not found.")
        return 114  # Default number of suras in the Quran.


def get_first_els_result_quran(gematria_sum: int, tlang: str = "en") -> Optional[Dict[str, Any]]:
    """
    Gets the first ELS (Equidistant Letter Sequence) result from the Quran,
    using the gematria sum as the skip step.

    Args:
        gematria_sum: The gematria value to use as the ELS step.
        tlang: Target language for results.

    Returns:
        The first ELS result found, or None.
    """
    from gematria import strip_diacritics

    logger.debug(f"Entering get_first_els_result_quran with gematria_sum: {gematria_sum}, tlang: {tlang}")
    # Create a cache key.
    cache_key = f"quran_els_{gematria_sum}_{tlang}"
    cache_file = "els_cache.db"

    # Check the cache first.
    try:
        with sqlite3.connect(cache_file) as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT results FROM els_cache WHERE query_hash = ?",
                (hashlib.sha256(cache_key.encode()).hexdigest(),))
            result = cursor.fetchone()
            if result:
                logger.info(f"Cache hit for Quran ELS query: {cache_key}")
                return json.loads(result[0])
    except sqlite3.Error as e:
        logger.error(f"Database error checking cache: {e}")
    # Cache miss: perform the ELS search.
    logger.info(f"Cache miss for Quran ELS query: {cache_key}, performing search")

    # Load the full Quran text.
    sura_count = get_sura_count()
    quran_data = process_quran_files(1, sura_count)

    # Concatenate all verses from all suras into a single text.
    all_text = ""
    for sura_id, sura_info in sorted(quran_data.items()):
        # Add a space between suras to prevent cross-sura word formation.
        if all_text:
            all_text += " "
        # Add all verses from this sura.
        verses = sura_info['text']
        all_text += " ".join(verses)

    # Clean up the text: strip diacritics and drop non-letter characters.
    clean_text = strip_diacritics(all_text)
    clean_text = ''.join(c for c in clean_text if c.isalpha() or c.isspace())

    # Perform the ELS search with gematria_sum as the step.
    result = None
    if clean_text:
        # Remove spaces for the ELS search.
        text_no_spaces = clean_text.replace(" ", "")
        # Map each character position back to its original sura and verse.
        char_map = []  # One (sura_id, sura_name, verse_idx) tuple per character.
        current_pos = 0
        for sura_id, sura_info in sorted(quran_data.items()):
            sura_name = sura_info['name']
            verses = sura_info['text']
            for verse_idx, verse in enumerate(verses, 1):
                # Apply the same cleaning as for text_no_spaces (diacritics
                # stripped, non-letter characters dropped) so the positions
                # stay aligned.
                cleaned_verse = ''.join(c for c in strip_diacritics(verse) if c.isalpha())
                for _ in cleaned_verse:
                    if current_pos < len(text_no_spaces):
                        char_map.append((sura_id, sura_name, verse_idx))
                        current_pos += 1
        # Try the first 100 start positions for better coverage.
        for start_pos in range(min(100, len(text_no_spaces))):
            # Extract characters at start_pos, start_pos + step, start_pos + 2 * step, ...
            extracted = ""
            positions = []
            pos = start_pos
            # Extract up to 7 characters (a typical ELS result length).
            for _ in range(7):
                if pos < len(text_no_spaces):
                    extracted += text_no_spaces[pos]
                    positions.append(pos)
                    pos += gematria_sum
                else:
                    break
            if len(extracted) >= 3:  # Require at least 3 characters.
                # Look up the sura/verse for the first and last character.
                first_pos = positions[0]
                last_pos = positions[-1]
                if first_pos < len(char_map) and last_pos < len(char_map):
                    first_loc = char_map[first_pos]
                    last_loc = char_map[last_pos]
                    result = {
                        "result_text": extracted,
                        "source": "Quran",
                        "start_position": start_pos,
                        "step": gematria_sum,
                        "start_sura": first_loc[0],
                        "start_sura_name": first_loc[1],
                        "start_verse": first_loc[2],
                        "end_sura": last_loc[0],
                        "end_sura_name": last_loc[1],
                        "end_verse": last_loc[2],
                        "positions": positions
                    }
                    break  # Found a result, stop searching.
                else:
                    logger.warning(
                        f"Character position mapping inconsistency: {first_pos}, {last_pos} vs {len(char_map)}")
    # Cache the result.
    if result:
        try:
            with sqlite3.connect(cache_file) as conn:
                cursor = conn.cursor()
                # Make sure the cache table exists.
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS els_cache (
                        query_hash TEXT PRIMARY KEY,
                        function_name TEXT,
                        args TEXT,
                        kwargs TEXT,
                        results TEXT
                    )
                ''')
                cursor.execute(
                    "INSERT OR REPLACE INTO els_cache (query_hash, function_name, args, kwargs, results) "
                    "VALUES (?, ?, ?, ?, ?)",
                    (hashlib.sha256(cache_key.encode()).hexdigest(), "get_first_els_result_quran",
                     json.dumps([gematria_sum]), json.dumps({"tlang": tlang}), json.dumps(result)))
                conn.commit()
                logger.debug("Cached Quran ELS results in database.")
        except sqlite3.Error as e:
            logger.error(f"Database error caching results: {e}")

    logger.debug(f"Exiting get_first_els_result_quran, returning: {result}")
    return result
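
# Usage sketch (hypothetical value; the first run performs the full search
# and caches the result in els_cache.db, later runs with the same arguments
# hit the cache):
#
#     els = get_first_els_result_quran(92)
#     if els:
#         print(els["result_text"], els["start_sura_name"], els["start_verse"])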


def initialize_quran_database(db_file: str = 'abjad.db', max_phrase_length: int = 1):
    """
    Initializes the abjad database with Quran verses.

    Processes all Quran JSON files and adds the gematria values of their
    phrases to the database.

    Args:
        db_file: The SQLite database file to use.
        max_phrase_length: Maximum phrase length (in words) to process.
    """
    from gematria import calculate_gematria, strip_diacritics
    from tqdm import tqdm  # Progress bars for the per-sura loop.

    logger.info(f"Initializing Quran database: {db_file}")
    # Create the tables if they don't exist.
    with sqlite3.connect(db_file) as conn:
        cursor = conn.cursor()
        # Create the results table.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS results (
                gematria_sum INTEGER,
                words TEXT,
                translation TEXT,
                book TEXT,
                chapter INTEGER,
                verse INTEGER,
                phrase_length INTEGER,
                word_position TEXT,
                PRIMARY KEY (gematria_sum, words, book, chapter, verse, word_position)
            )
        ''')
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_results_gematria
            ON results (gematria_sum)
        ''')
        # Create the processed_books table to track processing.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS processed_books (
                book TEXT PRIMARY KEY,
                max_phrase_length INTEGER
            )
        ''')
        conn.commit()

    # Process all Quran files.
    sura_count = get_sura_count()
    logger.info(f"Found {sura_count} suras to process")

    # Global counter for word position tracking.
    total_word_count = 0
    with sqlite3.connect(db_file) as conn:
        cursor = conn.cursor()
        # Process each sura (book).
        for sura_id in tqdm(range(1, sura_count + 1), desc="Processing Suras"):
            # Load sura data.
            sura_data = process_quran_files(sura_id, sura_id)
            if sura_id in sura_data:
                sura_info = sura_data[sura_id]
                sura_name = sura_info['name']
                # Check whether this sura has already been processed.
                cursor.execute('''SELECT max_phrase_length FROM processed_books WHERE book = ?''', (sura_name,))
                result = cursor.fetchone()
                if result and result[0] >= max_phrase_length:
                    logger.info(f"Skipping sura {sura_name}: Already processed with max_phrase_length {result[0]}")
                    # Still advance the global word counter so word positions
                    # in later suras stay consistent with a full run.
                    total_word_count += sum(len(v.split()) for v in sura_info['text'])
                    continue
                verses = sura_info['text']
                phrases_to_insert = []
                for verse_idx, verse_text in enumerate(verses, 1):
                    # Split the verse into words.
                    words = verse_text.split()
                    # Generate phrases of each length up to max_phrase_length.
                    for length in range(1, max_phrase_length + 1):
                        for start in range(len(words) - length + 1):
                            phrase = " ".join(words[start:start + length])
                            cleaned_phrase = strip_diacritics(phrase)
                            gematria_sum = calculate_gematria(cleaned_phrase.replace(" ", ""))
                            # Calculate the global word position range.
                            word_position_range = f"{total_word_count + start + 1}-{total_word_count + start + length}"
                            # Add to the batch insert list.
                            phrases_to_insert.append(
                                (gematria_sum, cleaned_phrase, "", sura_name, sura_id, verse_idx, length, word_position_range)
                            )
                    # Update the total word count after each verse.
                    total_word_count += len(words)
                # Batch-insert the collected phrases.
                if phrases_to_insert:
                    try:
                        cursor.executemany('''
                            INSERT OR IGNORE INTO results
                            (gematria_sum, words, translation, book, chapter, verse, phrase_length, word_position)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                        ''', phrases_to_insert)
                        # Mark the sura as processed.
                        cursor.execute('''
                            INSERT OR REPLACE INTO processed_books (book, max_phrase_length)
                            VALUES (?, ?)
                        ''', (sura_name, max_phrase_length))
                        conn.commit()
                    except sqlite3.Error as e:
                        logger.error(f"Database error: {e} for sura {sura_id}")
            else:
                logger.warning(f"Sura {sura_id} not found in processed data")

    logger.info("Quran database initialization completed successfully")
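

if __name__ == "__main__":
    # Minimal end-to-end sketch: populate the database, then run the two
    # lookups defined above. The gematria value 786 is an arbitrary example.
    logging.basicConfig(level=logging.INFO)
    initialize_quran_database(max_phrase_length=1)
    print(find_shortest_sura_match(786))
    print(get_first_els_result_quran(786))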