import json
import logging


def process_json_files(start, end):
    """
    Processes JSON files containing Tanach text and returns a dictionary
    mapping book IDs to their data.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).

    Returns:
        A dictionary where keys are book IDs and values are dictionaries
        containing 'title' and 'text' fields. Books whose files are
        missing, unreadable, or empty are silently skipped (a warning
        is logged).
    """
    base_path = "texts"
    results = {}
    for i in range(start, end + 1):
        file_name = f"{base_path}/{i:02}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if data:
                    results[i] = {
                        "title": data.get("title", "No title"),
                        "text": data.get("text", []),
                    }
        except FileNotFoundError:
            # Lazy %-style args keep formatting cost out of the happy path.
            logging.warning("File %s not found.", file_name)
        except json.JSONDecodeError as e:
            logging.warning("File %s could not be read as JSON: %s", file_name, e)
        # NOTE: a former `except KeyError` handler was removed — the body
        # only uses dict.get(), which cannot raise KeyError, so that
        # branch was unreachable dead code.
    return results


def flatten_text_with_line_breaks(text):
    """
    Flattens arbitrarily nested lists into a flat list of strings.

    Non-string leaf items are coerced with str(); nested lists are
    recursed into, preserving left-to-right order.
    """
    flattened_text = []
    for item in text:
        if isinstance(item, list):
            flattened_text.extend(flatten_text_with_line_breaks(item))
        elif isinstance(item, str):
            flattened_text.append(item)
        else:
            flattened_text.append(str(item))
    return flattened_text


def calculate_tanach_statistics(tanach_data):
    """
    Calculates per-book and corpus-wide statistics for the Tanach corpus.

    Args:
        tanach_data: Mapping of book ID -> {"title": str, "text": list},
            where "text" is a list of chapters and each chapter is a
            (possibly nested) list of verse strings.

    Returns:
        A dictionary with keys: total_books, total_chapters, total_verses,
        total_words, average_words_per_verse, and book_stats (a mapping of
        book ID -> per-book title/chapters/verses/words counts).
    """
    book_stats = {}
    total_chapters = 0
    total_verses = 0
    total_words = 0

    for book_id in tanach_data:
        book_title = tanach_data[book_id]["title"]
        chapters = tanach_data[book_id]["text"]
        book_chapters = len(chapters)
        book_verses = 0
        book_words = 0
        for chapter in chapters:
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            book_verses += len(flattened_chapter)
            # BUG FIX: words were previously counted as len(flattened_chapter)
            # (i.e. the verse count), making average_words_per_verse always 1.0.
            # Count whitespace-separated tokens in each verse instead.
            book_words += sum(len(verse.split()) for verse in flattened_chapter)
        total_chapters += book_chapters
        total_verses += book_verses
        total_words += book_words
        book_stats[book_id] = {
            "title": book_title,
            "chapters": book_chapters,
            "verses": book_verses,
            "words": book_words,
        }

    # Guard against division by zero for an empty corpus.
    average_words_per_verse = total_words / total_verses if total_verses > 0 else 0

    corpus_stats = {
        "total_books": len(tanach_data),
        "total_chapters": total_chapters,
        "total_verses": total_verses,
        "total_words": total_words,
        "average_words_per_verse": average_words_per_verse,
        "book_stats": book_stats,
    }
    return corpus_stats


def build_word_index(tanach_data):
    """
    Builds a sequential index over the corpus for positional lookup.

    Returns:
        A dictionary mapping a running counter to location dictionaries
        with keys "book_id", "chapter_id" (0-based), and "verse_id"
        (1-based).

    NOTE(review): despite the "word" naming, each indexed entry is one
    element of the flattened chapter — i.e. a verse string, not an
    individual word. Behavior intentionally preserved; confirm intent
    against callers before renaming the public function.
    """
    word_index = {}
    word_count = 0
    for book_id in tanach_data:
        for chapter_index, chapter in enumerate(tanach_data[book_id]["text"]):
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            for verse_index, word in enumerate(flattened_chapter):
                word_index[word_count] = {
                    "book_id": book_id,
                    "chapter_id": chapter_index,
                    "verse_id": verse_index + 1,
                }
                word_count += 1
    return word_index