"""Utilities for loading, flattening, and indexing Tanach JSON texts."""
import json
import logging
def process_json_files(start, end): | |
""" | |
Processes JSON files containing Tanach text and returns a dictionary | |
mapping book IDs to their data. | |
Args: | |
start: The starting book ID (inclusive). | |
end: The ending book ID (inclusive). | |
Returns: | |
A dictionary where keys are book IDs and values are dictionaries | |
containing 'title' and 'text' fields. | |
""" | |
base_path = "texts" | |
results = {} | |
for i in range(start, end + 1): | |
file_name = f"{base_path}/{i:02}.json" | |
try: | |
with open(file_name, 'r', encoding='utf-8') as file: | |
data = json.load(file) | |
if data: | |
results[i] = {"title": data.get("title", "No title"), "text": data.get("text", [])} | |
except FileNotFoundError: | |
logging.warning(f"File {file_name} not found.") | |
except json.JSONDecodeError as e: | |
logging.warning(f"File {file_name} could not be read as JSON: {e}") | |
except KeyError as e: | |
logging.warning(f"Expected key 'text' is missing in {file_name}: {e}") | |
return results | |
def flatten_text_with_line_breaks(text):
    """
    Recursively flatten arbitrarily nested lists into a flat list of strings.

    Args:
        text: A (possibly nested) list whose leaves are strings or other
            values; non-string leaves are converted with ``str()``.

    Returns:
        A flat list of strings, preserving the original left-to-right order.
    """
    flattened_text = []
    for item in text:
        if isinstance(item, list):
            flattened_text.extend(flatten_text_with_line_breaks(item))
        else:
            # Strings pass through unchanged; any other leaf is stringified.
            flattened_text.append(item if isinstance(item, str) else str(item))
    return flattened_text
def calculate_tanach_statistics(tanach_data):
    """
    Calculate per-book and corpus-wide statistics for the Tanach corpus.

    Args:
        tanach_data: Mapping of book ID -> {"title": str, "text": list of
            chapters, where each chapter is a (possibly nested) list of
            verse strings}.

    Returns:
        A dict with "total_books", "total_chapters", "total_verses",
        "total_words", "average_words_per_verse", and a per-book
        breakdown under "book_stats".
    """
    book_stats = {}
    total_chapters = 0
    total_verses = 0
    total_words = 0
    for book_id in tanach_data:
        book_title = tanach_data[book_id]["title"]
        chapters = tanach_data[book_id]["text"]
        book_chapters = len(chapters)
        book_verses = 0
        book_words = 0
        for chapter in chapters:
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            book_verses += len(flattened_chapter)
            # Count whitespace-separated tokens per verse. The original
            # reused the verse count here, which made words == verses and
            # forced average_words_per_verse to always be 1.0.
            book_words += sum(len(verse.split()) for verse in flattened_chapter)
        total_chapters += book_chapters
        total_verses += book_verses
        total_words += book_words
        book_stats[book_id] = {
            "title": book_title,
            "chapters": book_chapters,
            "verses": book_verses,
            "words": book_words,
        }
    # Guard against an empty corpus to avoid ZeroDivisionError.
    average_words_per_verse = total_words / total_verses if total_verses > 0 else 0
    corpus_stats = {
        "total_books": len(tanach_data),
        "total_chapters": total_chapters,
        "total_verses": total_verses,
        "total_words": total_words,
        "average_words_per_verse": average_words_per_verse,
        "book_stats": book_stats,
    }
    return corpus_stats
def build_word_index(tanach_data):
    """
    Build a flat positional index over the corpus for efficient lookup.

    Args:
        tanach_data: Mapping of book ID -> {"title": ..., "text": list of
            chapters (nested lists of strings)}.

    Returns:
        Dict mapping a running 0-based counter (in corpus traversal order)
        to the location of each flattened text item:
        {"book_id": ..., "chapter_id": 0-based, "verse_id": 1-based}.

    NOTE(review): each flattened item is indexed as one entry; whether an
    item is a single word or a whole verse depends on the JSON structure —
    confirm against the source files before relying on "word" semantics.
    """
    word_index = {}
    word_count = 0
    for book_id in tanach_data:
        for chapter_index, chapter in enumerate(tanach_data[book_id]["text"]):
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            for verse_index, word in enumerate(flattened_chapter):
                word_index[word_count] = {
                    "book_id": book_id,
                    "chapter_id": chapter_index,
                    # 1-based to match conventional verse numbering.
                    "verse_id": verse_index + 1,
                }
                word_count += 1
    return word_index