# tanach_clock/utils.py
# Author: bartman081523 (initial commit, 97260a5)
import json
import logging
def process_json_files(start, end):
    """
    Load Tanach book JSON files and map book IDs to their data.

    Each book is expected at ``texts/NN.json`` (two-digit, zero-padded ID)
    and to be a JSON object with ``title`` and ``text`` fields.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).

    Returns:
        A dictionary mapping each successfully loaded book ID to a dict
        with ``title`` and ``text`` keys. Books whose files are missing
        or unparseable are skipped with a warning.
    """
    base_path = "texts"
    results = {}
    for book_id in range(start, end + 1):
        file_name = f"{base_path}/{book_id:02}.json"
        # Keep the try body minimal: only opening/parsing can raise here.
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except FileNotFoundError:
            # Missing books are tolerated; callers get whatever loaded.
            logging.warning("File %s not found.", file_name)
            continue
        except json.JSONDecodeError as e:
            logging.warning("File %s could not be read as JSON: %s", file_name, e)
            continue
        # NOTE: the original also caught KeyError, but data.get() never
        # raises it, so that handler was dead code and has been removed.
        if data:  # skip empty/null documents
            results[book_id] = {
                "title": data.get("title", "No title"),
                "text": data.get("text", []),
            }
    return results
def flatten_text_with_line_breaks(text):
    """
    Recursively flatten arbitrarily nested lists into a flat list of strings.

    Strings pass through unchanged (so line-break entries survive as their
    own elements); any other leaf value is converted with ``str``.
    """
    flat = []
    for element in text:
        if isinstance(element, list):
            # Descend into nested lists and splice their contents in place.
            flat.extend(flatten_text_with_line_breaks(element))
        else:
            flat.append(element if isinstance(element, str) else str(element))
    return flat
def calculate_tanach_statistics(tanach_data):
    """
    Compute per-book and corpus-wide statistics for the Tanach corpus.

    Args:
        tanach_data: Mapping of book ID to a dict with ``title`` (str) and
            ``text`` (nested list of chapters/verses), as produced by
            ``process_json_files``.

    Returns:
        A dict with ``total_books``, ``total_chapters``, ``total_verses``,
        ``total_words``, ``average_words_per_verse`` and ``book_stats``
        (per-book dicts keyed by book ID with ``title``/``chapters``/
        ``verses``/``words``).
    """
    book_stats = {}
    total_chapters = 0
    total_verses = 0
    total_words = 0
    for book_id, book in tanach_data.items():
        chapters = book["text"]
        book_chapters = len(chapters)
        book_verses = 0
        book_words = 0
        for chapter in chapters:
            verses = flatten_text_with_line_breaks(chapter)
            book_verses += len(verses)
            # BUG FIX: the original added len(verses) here as well, which
            # made total_words == total_verses and the average always 1.0.
            # Count whitespace-separated words in each verse instead.
            book_words += sum(len(verse.split()) for verse in verses)
        total_chapters += book_chapters
        total_verses += book_verses
        total_words += book_words
        book_stats[book_id] = {
            "title": book["title"],
            "chapters": book_chapters,
            "verses": book_verses,
            "words": book_words
        }
    # Guard against an empty corpus (no verses at all).
    average_words_per_verse = total_words / total_verses if total_verses > 0 else 0
    corpus_stats = {
        "total_books": len(tanach_data),
        "total_chapters": total_chapters,
        "total_verses": total_verses,
        "total_words": total_words,
        "average_words_per_verse": average_words_per_verse,
        "book_stats": book_stats
    }
    return corpus_stats
def build_word_index(tanach_data):
    """
    Build a flat positional index over the corpus for efficient lookup.

    Args:
        tanach_data: Mapping of book ID to a dict with a ``text`` field
            (nested list of chapters/verses).

    Returns:
        A dict keyed by a 0-based running counter across the whole corpus;
        each value records the location as ``book_id``, 0-based
        ``chapter_id`` and 1-based ``verse_id``.
    """
    word_index = {}
    position = 0
    for book_id, book in tanach_data.items():
        for chapter_id, chapter in enumerate(book["text"]):
            entries = flatten_text_with_line_breaks(chapter)
            for offset in range(len(entries)):
                word_index[position] = {
                    "book_id": book_id,
                    "chapter_id": chapter_id,
                    "verse_id": offset + 1,
                }
                position += 1
    return word_index