Spaces:

neuralworm
/

xor_tanakh_2

Sleeping

xor_tanakh_2 / core /utils.py

initial commit

f054e62 9 days ago

1.2 kB

	import json
	import re
	from gematria import HEBREW_GEMATRIA_VALUES, calculate_gematria

	def load_torah_corpus(as_sentences=False, corpus_dir="texts/torah"):
	    """Load the Torah books as cleaned Hebrew words.

	    Reads ``01.json`` through ``05.json`` from ``corpus_dir``. For each book,
	    all strings under the ``"text"`` key are joined with spaces, bracketed
	    passages (``[...]``) are removed, and every character that is neither a
	    Hebrew letter in the range U+05D0-U+05EA nor whitespace is stripped.

	    Args:
	        as_sentences: If true, return a list of word lists instead of a flat
	            word list. Note that each "sentence" is the complete word list
	            of one book, not a verse.
	        corpus_dir: Directory holding the book files. The default is a
	            relative path, so it is resolved against the current working
	            directory.

	    Returns:
	        A flat list of words, or, when ``as_sentences`` is true, a list with
	        one word list per book that was found and contained any words.
	        Missing book files are skipped silently.
	    """
	    # Compile once; both patterns are reused for every book.
	    bracketed = re.compile(r"\[.*?\]", flags=re.DOTALL)
	    non_hebrew = re.compile(r"[^\u05D0-\u05EA\s]+")

	    all_words = []
	    all_sentences = []
	    for i in range(1, 6):  # books 1 through 5
	        # Only the file read is guarded, so that a missing book is skipped
	        # while errors in the processing below are never masked.
	        try:
	            with open(f"{corpus_dir}/{i:02}.json", 'r', encoding='utf-8') as file:
	                data = json.load(file)
	        except FileNotFoundError:
	            continue

	        full_text = ' '.join(' '.join(block) for block in data["text"])
	        clean_text = non_hebrew.sub("", bracketed.sub("", full_text))
	        words = clean_text.split()
	        if words:
	            all_words.extend(words)
	            # Word2Vec needs a sentence structure (a list of word lists).
	            all_sentences.append(words)
	    return all_sentences if as_sentences else all_words

	# Reusable constant: the (key, value) pairs of HEBREW_GEMATRIA_VALUES,
	# ordered by value from largest to smallest.
	SORTED_GEMATRIA = sorted(
	    HEBREW_GEMATRIA_VALUES.items(),
	    key=lambda pair: pair[1],
	    reverse=True,
	)