Spaces:

neuralworm
/

xor_tanakh_2

Sleeping

App Files Files Community

xor_tanakh_2 / run_query.py

neuralworm

initial commit

f054e62 9 days ago

raw

history blame contribute delete

8.14 kB

	import json
	import logging
	import argparse
	import numpy as np
	import sys
	import os
	import re
	from collections import defaultdict
	from gensim.models import Word2Vec
	from gematria import letter_to_value, HEBREW_GEMATRIA_VALUES

	# --- Configuration ---
	# Module-wide logging setup: INFO level, timestamped messages.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	HOLOGRAPHIC_STATE_SIZE_BITS = 4096 # Fixed state size in bits
	BITS_PER_CHAR = 16 # Bits per gematria value
	BOOK_RANGE = range(1, 40) # Books 1 through 39
	# Directory names used to build the per-book model and index file paths.
	MODELS_DIR = "models_by_book"
	INDICES_DIR = "indices_by_book"
	# (letter, value) pairs ordered from the largest gematria value to the smallest.
	SORTED_GEMATRIA = sorted(HEBREW_GEMATRIA_VALUES.items(), key=lambda item: item[1], reverse=True)

	# --- Helper functions for loading data ---

	def load_all_data():
	    """Load the phrase index and Word2Vec model of every book.

	    For each book number in ``BOOK_RANGE`` the index JSON is read from
	    ``INDICES_DIR`` and the model from ``MODELS_DIR``. A book is skipped
	    when either file is missing, and skipped with an error log entry when
	    loading fails.

	    Returns:
	        A tuple ``(all_indices, all_models)`` of dicts keyed by book number.
	        Both dicts always contain the same keys. ``(None, None)`` is
	        returned when no book could be loaded.
	    """
	    all_indices = {}
	    all_models = {}
	    logging.info("Lade Daten für alle Bücher...")
	    for i in BOOK_RANGE:
	        index_path = os.path.join(INDICES_DIR, f"book_{i:02}_index.json")
	        model_path = os.path.join(MODELS_DIR, f"book_{i:02}.w2v")
	        if not (os.path.exists(index_path) and os.path.exists(model_path)):
	            continue
	        try:
	            with open(index_path, 'r', encoding='utf-8') as f:
	                book_index = json.load(f)
	            book_model = Word2Vec.load(model_path)
	        except Exception as e:
	            logging.error(f"Konnte Daten für Buch {i:02} nicht laden: {e}")
	            continue
	        # Register the book only once BOTH parts loaded; otherwise an index
	        # without a matching model would break later model lookups.
	        all_indices[i] = book_index
	        all_models[i] = book_model
	    if not all_indices:
	        logging.error("Keine Index/Modell-Dateien gefunden. Bitte 'build_all_indices.py' ausführen.")
	        return None, None
	    logging.info(f"{len(all_indices)} Buch-Netzwerke erfolgreich geladen.")
	    return all_indices, all_models

	# --- Core logic: Holographic State Engine ---

	def text_to_gematria_binary(text):
	    """Convert a text into a gematria-based binary string.

	    Every character outside the range U+05D0..U+05EA is removed first.
	    Each remaining letter is mapped through ``letter_to_value`` and written
	    as a zero-padded binary number of ``BITS_PER_CHAR`` digits; the pieces
	    are concatenated in text order.
	    """
	    hebrew_only = re.sub(r"[^\u05D0-\u05EA]+", "", text)
	    encoded_letters = [
	        format(letter_to_value(letter), f'0{BITS_PER_CHAR}b')
	        for letter in hebrew_only
	    ]
	    return "".join(encoded_letters)

	def fold_into_state(binary_string, state_size=HOLOGRAPHIC_STATE_SIZE_BITS):
	    """Fold a binary string of any length into a state of fixed size.

	    The input is cut into consecutive chunks of ``state_size`` characters.
	    The last chunk is right-padded with '0' when it is shorter, and all
	    chunks are XOR-ed together position by position, starting from an
	    all-zero state.

	    Returns:
	        The folded state as a string of ``state_size`` digits.
	    """
	    accumulator = np.zeros(state_size, dtype=np.int8)
	    chunk_starts = range(0, len(binary_string), state_size)
	    chunks = (binary_string[start:start + state_size] for start in chunk_starts)
	    for chunk in chunks:
	        digits = list(chunk.ljust(state_size, '0'))
	        accumulator = np.bitwise_xor(accumulator, np.array(digits, dtype=np.int8))
	    return "".join(accumulator.astype(str))

	def create_holographic_context():
	    """Build the fixed-size holographic state from all books.

	    For every book number in ``BOOK_RANGE`` the file
	    ``texts/torah/<NN>.json`` is read, its ``"text"`` blocks are joined,
	    bracketed passages and all non-Hebrew characters are removed, and the
	    remaining letters are converted to a gematria bit string. That bit
	    string is folded to ``HOLOGRAPHIC_STATE_SIZE_BITS`` bits and XOR-ed into
	    the running state, so every book contributes to the result.

	    Missing files are skipped with a warning; books without any Hebrew
	    letters are skipped silently.

	    Returns:
	        The accumulated state as a string of '0'/'1' characters with
	        ``HOLOGRAPHIC_STATE_SIZE_BITS`` entries (all zeros when no book
	        contributed).
	    """
	    logging.info("Erstelle holographischen State durch sequentielles Einfalten aller Bücher...")
	    state_bits = np.zeros(HOLOGRAPHIC_STATE_SIZE_BITS, dtype=np.int8)

	    for i in BOOK_RANGE:
	        # Keep the try body limited to the file access that can raise
	        # FileNotFoundError.
	        try:
	            with open(f"texts/torah/{i:02}.json", 'r', encoding='utf-8') as file:
	                logging.info(f"Falte Buch {i:02} in den State ein...")
	                data = json.load(file)
	        except FileNotFoundError:
	            logging.warning(f"Datei für Buch {i:02} nicht gefunden, wird übersprungen.")
	            continue

	        full_text = ' '.join([' '.join(block) for block in data.get("text", [])])
	        without_brackets = re.sub(r"\[.*?\]", "", full_text, flags=re.DOTALL)
	        clean_text = re.sub(r"[^\u05D0-\u05EA]+", "", without_brackets)
	        if not clean_text:
	            continue

	        book_binary = text_to_gematria_binary(clean_text)
	        book_state = fold_into_state(book_binary, HOLOGRAPHIC_STATE_SIZE_BITS)
	        # XOR into the running state. A plain assignment here would discard
	        # every previously folded book and keep only the last one.
	        state_bits ^= np.array(list(book_state), dtype=np.int8)

	    logging.info("Holographischer Tanach-State wurde erfolgreich erstellt.")
	    return "".join(state_bits.astype(str))

	# --- Functions for phrase selection and processing ---

	def cosine_similarity(v1, v2):
	    """Return the cosine similarity between two vectors.

	    A small constant (1e-9) is added to the product of the norms, so a
	    zero vector produces 0 instead of a division by zero.
	    """
	    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
	    return np.dot(v1, v2) / (norm_product + 1e-9)

	def get_decomposed_word(number):
	    """Fallback: break a number down into Hebrew letters algorithmically.

	    Walks ``SORTED_GEMATRIA`` (largest value first) and emits each letter
	    for as long as its value still fits into the remaining amount. Whatever
	    remains after the last letter is dropped.
	    """
	    letters = []
	    left_over = number
	    for letter, worth in SORTED_GEMATRIA:
	        while left_over >= worth:
	            letters.append(letter)
	            left_over -= worth
	    return "".join(letters)

	def get_best_phrase_from_book(gematria_val, book_index, method, query_vector=None):
	    """Pick the best phrase for a gematria value from a SINGLE book index.

	    The index is looked up under ``str(gematria_val)``; its ``'phrases'``
	    list provides the candidates. Selection depends on ``method``:

	    * ``'frequency'``: the candidate with the smallest ``'count'``.
	    * ``'semantic'``: the candidate whose ``'vector'`` is most similar to
	      ``query_vector``; only used when a non-zero query vector is given.
	    * ``'network'``: the candidate maximising the entry's ``'pagerank'``
	      divided by the candidate's ``'count'``.
	    * anything else (or an unusable semantic query): the first candidate.

	    Returns:
	        The chosen candidate dict, or ``None`` when there are no candidates.
	    """
	    entry = book_index.get(str(gematria_val), {})
	    candidates = entry.get('phrases', [])
	    if not candidates:
	        return None

	    if method == 'frequency':
	        return min(candidates, key=lambda phrase: phrase.get('count', 1))

	    if method == 'semantic':
	        if query_vector is not None and not np.all(query_vector == 0):
	            def similarity(phrase):
	                return cosine_similarity(np.array(phrase['vector']), query_vector)

	            return max(candidates, key=similarity)
	    elif method == 'network':
	        pagerank_score = entry.get('pagerank', 0)
	        return max(candidates, key=lambda phrase: pagerank_score / phrase.get('count', 1))

	    return candidates[0]

	def process_query_holographic(query_text, tanakh_state, all_indices, all_models, method):
	    """Run a query against the holographic state and group results by book.

	    Args:
	        query_text: The query phrase.
	        tanakh_state: The holographic state as a string of '0'/'1' digits.
	        all_indices: Dict mapping book number to that book's phrase index.
	        all_models: Dict mapping book number to that book's Word2Vec model;
	            only consulted when ``method`` is ``'semantic'``.
	        method: Selection method passed on to ``get_best_phrase_from_book``.

	    Returns:
	        A string with one section per book (ascending book number). Each
	        section is a ``--- Buch NN ---`` header followed by the book's
	        distinct phrases in first-seen order, separated by `` | ``. The
	        string is empty when nothing matched.
	    """
	    # Step 1: fold the query and XOR it with the state to get the "concept".
	    query_binary = text_to_gematria_binary(query_text)
	    konzept_state = fold_into_state(query_binary, HOLOGRAPHIC_STATE_SIZE_BITS)
	    final_konzept = "".join(
	        str(int(a) ^ int(b)) for a, b in zip(tanakh_state, konzept_state)
	    )

	    # Step 2: read the concept as consecutive BITS_PER_CHAR-wide numbers;
	    # zero chunks carry no gematria value and are ignored.
	    gematria_values = []
	    for i in range(0, HOLOGRAPHIC_STATE_SIZE_BITS, BITS_PER_CHAR):
	        gematria_val = int(final_konzept[i:i + BITS_PER_CHAR], 2)
	        if gematria_val != 0:
	            gematria_values.append(gematria_val)

	    query_words = query_text.split()
	    results_by_book = {}
	    for book_num, book_index in all_indices.items():
	        # The query vector depends only on the book, so compute it once per
	        # book rather than once per gematria value.
	        query_vector = None
	        if method == 'semantic':
	            book_model = all_models[book_num]
	            word_vectors = [book_model.wv[w] for w in query_words if w in book_model.wv]
	            # Fall back to a zero vector when no query word is in the vocabulary.
	            query_vector = np.mean(
	                word_vectors or [np.zeros(book_model.vector_size)], axis=0
	            )

	        phrases = []
	        for gematria_val in gematria_values:
	            best_phrase_data = get_best_phrase_from_book(
	                gematria_val, book_index, method, query_vector
	            )
	            if best_phrase_data:
	                phrases.append(best_phrase_data['text'])
	        if phrases:
	            results_by_book[book_num] = phrases

	    # Step 3: format the output, one section per book.
	    sections = []
	    for book_num in sorted(results_by_book):
	        # dict.fromkeys removes duplicates while keeping first-seen order.
	        unique_phrases = list(dict.fromkeys(results_by_book[book_num]))
	        phrases_str = " | ".join(unique_phrases)
	        if phrases_str:
	            sections.append(f"\n--- Buch {book_num:02} ---\n{phrases_str}")

	    return "".join(sections)

	# --- Main program ---
	def main(args):
	    """Load the data, build the state and run the holographic query.

	    Args:
	        args: Parsed command-line arguments; ``args.query`` and
	            ``args.method`` are read.

	    Exits the process with status 1 when no book data could be loaded or
	    when no state was produced.
	    """
	    all_indices, all_models = load_all_data()
	    if not all_indices:
	        sys.exit(1)

	    holographic_tanakh_state = create_holographic_context()
	    if not holographic_tanakh_state:
	        sys.exit(1)

	    # This model has no iterations any more, because the query is part of
	    # the state creation.
	    banner = "=" * 15
	    print(f"\n{banner} HOLOGRAPHISCHE ABFRAGE (Methode: {args.method}) {banner}")
	    logging.info(f"Aktuelle Abfrage: '{args.query}'")

	    result_text = process_query_holographic(
	        args.query, holographic_tanakh_state, all_indices, all_models, args.method
	    )

	    print("\nErgebnis aus dem holographischen State, geordnet nach Büchern:")
	    print(result_text)

	if __name__ == "__main__":
	    # Command-line entry point: a positional query plus an optional
	    # phrase-weighting method, then hand over to main().
	    cli_parser = argparse.ArgumentParser(
	        description="Holographic XOR Gematria Machine."
	    )
	    cli_parser.add_argument(
	        "query",
	        type=str,
	        help="Die anfängliche Abfragephrase.",
	    )
	    cli_parser.add_argument(
	        "--method",
	        type=str,
	        choices=['frequency', 'semantic', 'network', 'default'],
	        default='default',
	        help="Die Gewichtungsmethode für die Phrasenauswahl.",
	    )

	    args = cli_parser.parse_args()
	    main(args)