Spaces:

neuralworm
/

xor_tanakh_2

Sleeping

File size: 1,199 Bytes

f054e62

import json
import re
from gematria import HEBREW_GEMATRIA_VALUES, calculate_gematria

def load_torah_corpus(as_sentences=False):
    """Load the Torah books as cleaned Hebrew tokens.

    Reads ``texts/torah/01.json`` through ``texts/torah/05.json`` (relative
    to the current working directory). Each file must be a JSON object whose
    ``"text"`` entry is a sequence of blocks, each block being a sequence of
    strings. All strings of a book are joined with single spaces, every
    ``[...]`` span is removed, and every character that is neither
    whitespace nor in the range U+05D0-U+05EA is stripped before the text is
    split on whitespace.

    Files that do not exist are skipped silently, as are books that yield
    no tokens after cleaning.

    Args:
        as_sentences: When true, return one token list per book (a list of
            lists); otherwise return a single flat list of all tokens.

    Returns:
        A flat list of token strings, or a list with one token list per
        loaded book when ``as_sentences`` is true.
    """
    flat_tokens = []
    per_book_tokens = []

    for book_number in range(1, 6):  # books 1 through 5
        book_path = f"texts/torah/{book_number:02}.json"
        try:
            with open(book_path, 'r', encoding='utf-8') as handle:
                book = json.load(handle)
        except FileNotFoundError:
            # Best effort: a missing book simply contributes nothing.
            continue

        joined_blocks = [' '.join(block) for block in book["text"]]
        raw_text = ' '.join(joined_blocks)

        # Drop bracketed spans first (non-greedy, may cross line breaks) ...
        without_brackets = re.sub(r"\[.*?\]", "", raw_text, flags=re.DOTALL)
        # ... then keep only characters in U+05D0-U+05EA and whitespace.
        hebrew_only = re.sub(r"[^\u05D0-\u05EA\s]+", "", without_brackets)

        tokens = hebrew_only.split()
        if not tokens:
            continue

        flat_tokens.extend(tokens)
        # Word2Vec-style consumers need a list of token lists; each book
        # contributes exactly one such list.
        per_book_tokens.append(tokens)

    if as_sentences:
        return per_book_tokens
    return flat_tokens

# Reusable constants
# (key, value) pairs of HEBREW_GEMATRIA_VALUES ordered by value, largest first.
SORTED_GEMATRIA = list(HEBREW_GEMATRIA_VALUES.items())
SORTED_GEMATRIA.sort(key=lambda pair: pair[1], reverse=True)