# NOTE: scraped-page residue removed here (hosting-site status text, file
# size, commit hash, and a line-number gutter) — it was not part of the
# module and made the file invalid Python.
import json
import logging
def process_json_files(start, end):
"""
Processes JSON files containing Tanach text and returns a dictionary
mapping book IDs to their data.
Args:
start: The starting book ID (inclusive).
end: The ending book ID (inclusive).
Returns:
A dictionary where keys are book IDs and values are dictionaries
containing 'title' and 'text' fields.
"""
base_path = "texts"
results = {}
for i in range(start, end + 1):
file_name = f"{base_path}/{i:02}.json"
try:
with open(file_name, 'r', encoding='utf-8') as file:
data = json.load(file)
if data:
results[i] = {"title": data.get("title", "No title"), "text": data.get("text", [])}
except FileNotFoundError:
logging.warning(f"File {file_name} not found.")
except json.JSONDecodeError as e:
logging.warning(f"File {file_name} could not be read as JSON: {e}")
except KeyError as e:
logging.warning(f"Expected key 'text' is missing in {file_name}: {e}")
return results
def flatten_text_with_line_breaks(text):
    """
    Flattens nested lists while preserving line breaks.

    Recursively expands any nested list; strings pass through unchanged
    and any other value is converted with str().
    """
    result = []
    for element in text:
        if isinstance(element, list):
            # Recurse into nested structure and splice the result in.
            result += flatten_text_with_line_breaks(element)
        else:
            result.append(element if isinstance(element, str) else str(element))
    return result
def calculate_tanach_statistics(tanach_data):
    """
    Calculates statistics for the Tanach corpus.

    Args:
        tanach_data: Mapping of book ID -> {"title": str, "text": list of
            chapters, each a (possibly nested) list of verse strings}.

    Returns:
        A dictionary with corpus totals ("total_books", "total_chapters",
        "total_verses", "total_words", "average_words_per_verse") and a
        per-book breakdown under "book_stats".
    """
    book_stats = {}
    total_chapters = 0
    total_verses = 0
    total_words = 0
    for book_id, book in tanach_data.items():
        chapters = book["text"]
        book_verses = 0
        book_words = 0
        for chapter in chapters:
            verses = flatten_text_with_line_breaks(chapter)
            book_verses += len(verses)
            # Bug fix: the original added len(verses) here too, so the
            # word count duplicated the verse count (and the average was
            # always 1). Count whitespace-separated words per verse.
            book_words += sum(len(verse.split()) for verse in verses)
        total_chapters += len(chapters)
        total_verses += book_verses
        total_words += book_words
        book_stats[book_id] = {
            "title": book["title"],
            "chapters": len(chapters),
            "verses": book_verses,
            "words": book_words,
        }
    # Guard against an empty corpus to avoid ZeroDivisionError.
    average_words_per_verse = total_words / total_verses if total_verses > 0 else 0
    return {
        "total_books": len(tanach_data),
        "total_chapters": total_chapters,
        "total_verses": total_verses,
        "total_words": total_words,
        "average_words_per_verse": average_words_per_verse,
        "book_stats": book_stats,
    }
def build_word_index(tanach_data):
    """
    Builds a word index for efficient lookup.

    Each entry in every flattened chapter is assigned a sequential
    position across the whole corpus, mapped to its location:
    0-based chapter index and 1-based verse index within the chapter.
    """
    word_index = {}
    position = 0
    for book_id, book in tanach_data.items():
        for chapter_id, chapter in enumerate(book["text"]):
            for verse_offset, word in enumerate(flatten_text_with_line_breaks(chapter)):
                word_index[position] = {
                    "book_id": book_id,
                    "chapter_id": chapter_id,
                    "verse_id": verse_offset + 1,
                }
                position += 1
    return word_index