# Source: Hugging Face Spaces - neuralworm / xor_tanakh_2 (status: Sleeping)
# File size: 8,139 Bytes
# Revision: f054e62

import json
import logging
import argparse
import numpy as np
import sys
import os
import re
from collections import defaultdict
from gensim.models import Word2Vec
from gematria import letter_to_value, HEBREW_GEMATRIA_VALUES

# --- Configuration ---
# Module-wide logging setup: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
HOLOGRAPHIC_STATE_SIZE_BITS = 4096  # Fixed size of the folded state, in bits
BITS_PER_CHAR = 16                  # Bits used to encode one gematria value
BOOK_RANGE = range(1, 40)           # Book numbers 1 through 39
MODELS_DIR = "models_by_book"       # Directory holding the per-book Word2Vec model files
INDICES_DIR = "indices_by_book"     # Directory holding the per-book JSON index files
# (letter, value) pairs ordered from the largest gematria value to the
# smallest, so a greedy decomposition can walk them front to back.
SORTED_GEMATRIA = sorted(HEBREW_GEMATRIA_VALUES.items(), key=lambda item: item[1], reverse=True)

# --- Hilfsfunktionen für das Laden von Daten ---

def load_all_data():
    """Load the per-book phrase indices and Word2Vec models.

    For every book number in ``BOOK_RANGE`` this looks for
    ``book_NN_index.json`` inside ``INDICES_DIR`` and ``book_NN.w2v`` inside
    ``MODELS_DIR``. Books for which either file is missing are skipped
    silently; books for which loading raises are logged and skipped.

    A book is registered only when BOTH its index and its model were loaded,
    so the two returned dicts always share exactly the same keys.

    Returns:
        tuple: ``(all_indices, all_models)``, two dicts keyed by the integer
        book number, or ``(None, None)`` when no book could be loaded.
    """
    all_indices = {}
    all_models = {}
    logging.info("Lade Daten für alle Bücher...")
    for i in BOOK_RANGE:
        index_path = os.path.join(INDICES_DIR, f"book_{i:02}_index.json")
        model_path = os.path.join(MODELS_DIR, f"book_{i:02}.w2v")
        if not (os.path.exists(index_path) and os.path.exists(model_path)):
            continue
        try:
            with open(index_path, 'r', encoding='utf-8') as f:
                book_index = json.load(f)
            book_model = Word2Vec.load(model_path)
        except Exception as e:
            # Deliberately broad: a corrupt index or model file for one book
            # must not prevent the remaining books from loading.
            logging.error(f"Konnte Daten für Buch {i:02} nicht laden: {e}")
            continue
        # Store both halves together only after both loads succeeded; a
        # half-registered book would have an index without a matching model.
        all_indices[i] = book_index
        all_models[i] = book_model
    if not all_indices:
        logging.error("Keine Index/Modell-Dateien gefunden. Bitte 'build_all_indices.py' ausführen.")
        return None, None
    logging.info(f"{len(all_indices)} Buch-Netzwerke erfolgreich geladen.")
    return all_indices, all_models

# --- Kernlogik: Holographic State Engine ---

def text_to_gematria_binary(text):
    """Encode the Hebrew letters of *text* as a string of '0'/'1' characters.

    Every character outside the range U+05D0..U+05EA is dropped. Each
    remaining letter is mapped through ``letter_to_value`` and written as a
    zero-padded binary number of ``BITS_PER_CHAR`` digits; the groups are
    concatenated in text order.
    """
    hebrew_only = re.sub(r"[^\u05D0-\u05EA]+", "", text)
    bit_groups = []
    for letter in hebrew_only:
        bit_groups.append(format(letter_to_value(letter), f'0{BITS_PER_CHAR}b'))
    return "".join(bit_groups)

def fold_into_state(binary_string, state_size=HOLOGRAPHIC_STATE_SIZE_BITS):
    """Fold a bit string of any length into a bit string of fixed length.

    The input is cut into consecutive blocks of ``state_size`` characters.
    A short final block is right-padded with '0'. All blocks are XOR-ed
    together element by element, starting from an all-zero state, and the
    result is returned as a string of ``state_size`` characters. An empty
    input therefore yields the all-zero state.
    """
    folded = np.zeros(state_size, dtype=np.int8)
    for start in range(0, len(binary_string), state_size):
        chunk = binary_string[start:start + state_size].ljust(state_size, '0')
        # Each character ('0' or '1') becomes one int8 element.
        chunk_bits = np.array(list(chunk), dtype=np.int8)
        folded = folded ^ chunk_bits
    return "".join(folded.astype(str))

def create_holographic_context():
    """Build the fixed-size state by folding every book into it in turn.

    For each book number in ``BOOK_RANGE`` the file ``texts/torah/NN.json``
    is read. Its ``"text"`` entry is joined into one string (assumes a list
    of lists of strings; confirm against the data files), bracketed passages
    ``[...]`` and all non-Hebrew characters are removed, and the remainder is
    converted to a bit string and folded to ``HOLOGRAPHIC_STATE_SIZE_BITS``
    bits. That per-book fold is XOR-ed into the running state, so the result
    depends on every book that was read rather than only the last one.

    Missing files are logged and skipped; books without Hebrew text are
    skipped silently.

    Returns:
        str: the accumulated state as ``HOLOGRAPHIC_STATE_SIZE_BITS``
        characters of '0'/'1'. If no book contributed, this is all zeros and
        a warning is logged.
    """
    logging.info("Erstelle holographischen State durch sequentielles Einfalten aller Bücher...")
    final_state = '0' * HOLOGRAPHIC_STATE_SIZE_BITS
    folded_books = 0

    for i in BOOK_RANGE:
        # Keep the try narrow: only opening/parsing the file is guarded.
        try:
            with open(f"texts/torah/{i:02}.json", 'r', encoding='utf-8') as file:
                logging.info(f"Falte Buch {i:02} in den State ein...")
                data = json.load(file)
        except FileNotFoundError:
            logging.warning(f"Datei für Buch {i:02} nicht gefunden, wird übersprungen.")
            continue

        full_text = ' '.join([' '.join(block) for block in data.get("text", [])])
        # Drop bracketed passages first, then everything that is not a Hebrew letter.
        clean_text = re.sub(r"[^\u05D0-\u05EA]+", "", re.sub(r"\[.*?\]", "", full_text, flags=re.DOTALL))

        if not clean_text:
            continue

        book_binary = text_to_gematria_binary(clean_text)
        book_state = fold_into_state(book_binary, HOLOGRAPHIC_STATE_SIZE_BITS)
        # Accumulate instead of overwrite: XOR this book's fold into the
        # state built from all previous books.
        final_state = "".join(str(int(a) ^ int(b)) for a, b in zip(final_state, book_state))
        folded_books += 1

    if folded_books == 0:
        logging.warning("Kein Buchtext gefunden; der holographische State besteht nur aus Nullen.")
    else:
        logging.info("Holographischer Tanach-State wurde erfolgreich erstellt.")
    return final_state

# --- Funktionen zur Phrasen-Auswahl und -Verarbeitung ---

def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    A small constant (1e-9) is added to the product of the norms so that a
    zero vector yields 0.0 instead of a division by zero.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / (norm_product + 1e-9)

def get_decomposed_word(number):
    """Fallback: greedily spell *number* with the letters in ``SORTED_GEMATRIA``.

    Walks the (letter, value) pairs in their stored order and, for each pair,
    appends the letter as long as its value still fits into what is left of
    the number. Returns the collected letters as one string.
    """
    letters = []
    left_over = number
    for letter, worth in SORTED_GEMATRIA:
        while left_over >= worth:
            letters.append(letter)
            left_over -= worth
    return "".join(letters)

def get_best_phrase_from_book(gematria_val, book_index, method, query_vector=None):
    """Pick one phrase for *gematria_val* from a SINGLE book index.

    The index entry is looked up under ``str(gematria_val)``; its
    ``'phrases'`` list supplies the candidates. Selection by *method*:

    * ``'frequency'``: the candidate with the smallest ``'count'``
      (a missing count is treated as 1).
    * ``'semantic'``: the candidate whose ``'vector'`` is most similar to
      *query_vector*, provided that vector is given and not all zeros.
    * ``'network'``: the candidate maximising the entry's ``'pagerank'``
      (default 0) divided by the candidate's ``'count'`` (default 1).
    * anything else, or ``'semantic'`` without a usable query vector:
      the first candidate.

    Returns the chosen candidate dict, or ``None`` when there are no
    candidates.
    """
    entry = book_index.get(str(gematria_val), {})
    candidates = entry.get('phrases', [])
    if not candidates:
        return None

    if method == 'frequency':
        return min(candidates, key=lambda p: p.get('count', 1))

    if method == 'semantic':
        if query_vector is not None and not np.all(query_vector == 0):
            def similarity_to_query(phrase):
                return cosine_similarity(np.array(phrase['vector']), query_vector)
            return max(candidates, key=similarity_to_query)
    elif method == 'network':
        pagerank_score = entry.get('pagerank', 0)
        return max(candidates, key=lambda p: pagerank_score / p.get('count', 1))

    return candidates[0]

def _query_vector_for_book(book_model, query_tokens):
    """Average the book model's word vectors of the query tokens it knows.

    Returns a zero vector of the model's ``vector_size`` when none of the
    tokens is in the model's vocabulary.
    """
    known_vectors = [book_model.wv[w] for w in query_tokens if w in book_model.wv]
    if not known_vectors:
        return np.zeros(book_model.vector_size)
    return np.mean(known_vectors, axis=0)


def process_query_holographic(query_text, tanakh_state, all_indices, all_models, method):
    """Run a query against the holographic state and group hits by book.

    Steps:
      1. Convert *query_text* to a bit string, fold it to
         ``HOLOGRAPHIC_STATE_SIZE_BITS`` bits and XOR it with *tanakh_state*.
      2. Read the result in groups of ``BITS_PER_CHAR`` bits; every non-zero
         group is a gematria value. For each book in *all_indices* the best
         phrase for each value is selected via ``get_best_phrase_from_book``.
      3. Per book, list the distinct phrases in order of first appearance.

    Args:
        query_text: the query phrase.
        tanakh_state: state string of '0'/'1' characters.
        all_indices: dict mapping book number to that book's index dict.
        all_models: dict mapping book number to that book's Word2Vec model;
            only consulted when *method* is ``'semantic'``.
        method: selection method passed on to ``get_best_phrase_from_book``.

    Returns:
        str: one ``"\\n--- Buch NN ---\\n<phrases>"`` section per book with
        hits, ordered by book number; empty string when nothing matched.
    """
    # Step 1: fold the query into the state to obtain the final "concept".
    query_binary = text_to_gematria_binary(query_text)
    konzept_state = fold_into_state(query_binary, HOLOGRAPHIC_STATE_SIZE_BITS)
    final_konzept = "".join(str(int(a) ^ int(b)) for a, b in zip(tanakh_state, konzept_state))

    # Step 2: extract the gematria values. Duplicates are dropped up front
    # (keeping first-seen order): the same value always selects the same
    # phrase, and phrases are de-duplicated in the output anyway.
    chunk_values = (
        int(final_konzept[i:i + BITS_PER_CHAR], 2)
        for i in range(0, HOLOGRAPHIC_STATE_SIZE_BITS, BITS_PER_CHAR)
    )
    gematria_values = [value for value in dict.fromkeys(chunk_values) if value != 0]
    if not gematria_values:
        return ""

    query_tokens = query_text.split()
    results_by_book = {}
    for book_num, book_index in all_indices.items():
        # The query vector depends only on the book, so compute it once per
        # book instead of once per gematria value.
        query_vector = None
        if method == 'semantic':
            book_model = all_models.get(book_num)
            if book_model is None:
                # Without a model there is no query vector; phrase selection
                # then falls back to the first candidate.
                logging.warning(f"Kein Modell für Buch {book_num:02} gefunden; semantische Auswahl nicht möglich.")
            else:
                query_vector = _query_vector_for_book(book_model, query_tokens)

        book_phrases = []
        for gematria_val in gematria_values:
            best_phrase_data = get_best_phrase_from_book(gematria_val, book_index, method, query_vector)
            if best_phrase_data:
                book_phrases.append(best_phrase_data['text'])
        if book_phrases:
            results_by_book[book_num] = book_phrases

    # Step 3: format the output, one section per book in ascending order.
    sections = []
    for book_num in sorted(results_by_book):
        # dict.fromkeys removes duplicates while keeping first-seen order.
        phrases_str = " | ".join(dict.fromkeys(results_by_book[book_num]))
        if phrases_str:
            sections.append(f"\n--- Buch {book_num:02} ---\n{phrases_str}")

    return "".join(sections)

# --- Hauptprogramm ---
def main(args):
    """Load the data, build the state and run one holographic query.

    Reads ``args.query`` and ``args.method``. Exits with status 1 when no
    book data could be loaded or when no state was produced; otherwise
    prints the query result grouped by book.
    """
    indices, models = load_all_data()
    if not indices:
        sys.exit(1)

    tanakh_state = create_holographic_context()
    if not tanakh_state:
        sys.exit(1)

    # This model no longer iterates: the query itself is part of building the state.
    banner_rule = "=" * 15
    print("\n" + banner_rule + f" HOLOGRAPHISCHE ABFRAGE (Methode: {args.method}) " + banner_rule)
    logging.info(f"Aktuelle Abfrage: '{args.query}'")

    answer = process_query_holographic(args.query, tanakh_state, indices, models, args.method)

    print("\nErgebnis aus dem holographischen State, geordnet nach Büchern:")
    print(answer)

if __name__ == "__main__":
    # Command-line entry point: one positional query phrase plus an optional
    # --method flag that selects how phrases are weighted.
    parser = argparse.ArgumentParser(
        description="Holographic XOR Gematria Machine.",
    )
    parser.add_argument(
        "query",
        type=str,
        help="Die anfängliche Abfragephrase.",
    )
    parser.add_argument(
        "--method",
        type=str,
        choices=['frequency', 'semantic', 'network', 'default'],
        default='default',
        help="Die Gewichtungsmethode für die Phrasenauswahl.",
    )

    args = parser.parse_args()
    main(args)