# xor_tanakh_2 / explore_tanakh.py
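"""
Explores the Tanakh through a "holographic" XOR state.

Each book's text is gematria-encoded (BITS_PER_CHAR bits per letter) and
XOR-folded into a fixed 4096-bit state. A query phrase is folded the same way,
XORed against the Tanakh state, and the resulting 16-bit chunks are read as
gematria values that are resolved against per-book indices to retrieve phrases.
The lookup repeats recursively ("fractal" mode) up to a configurable depth.
"""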
import json
import logging
import argparse
import numpy as np
import sys
import os
import re
from collections import Counter
import pickle
from gematria import letter_to_value, HEBREW_GEMATRIA_VALUES, linearize_umlauts, decompose_to_latin
# --- Configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger("gensim").setLevel(logging.WARNING)
HOLOGRAPHIC_STATE_SIZE_BITS = 4096
BITS_PER_CHAR = 16
BOOK_RANGE = range(1, 40)
MODELS_DIR = "models_by_book"
INDICES_DIR = "indices_by_book"
CACHE_FILE = "tanakh_data.cache"
SORTED_GEMATRIA = sorted(HEBREW_GEMATRIA_VALUES.items(), key=lambda item: item[1], reverse=True)
def setup_logging(debug_mode):
level = logging.DEBUG if debug_mode else logging.INFO
logging.getLogger().setLevel(level)
# --- Core engine as a class ---
class TanakhExplorer:
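    """Loads the per-book indices and builds (or restores from cache) the
    holographic Tanakh state used for all lookups."""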
def __init__(self, use_cache=True):
self.all_indices = {}
self.tanakh_state = None
cache_valid = use_cache and os.path.exists(CACHE_FILE)
if cache_valid:
try:
logging.info(f"Lade Daten aus Cache-Datei: {CACHE_FILE}")
with open(CACHE_FILE, 'rb') as f:
cached_data = pickle.load(f)
self.all_indices = cached_data.get('indices', {})
self.tanakh_state = cached_data.get('state')
logging.info("Daten erfolgreich aus Cache geladen.")
except Exception as e:
logging.warning(f"Cache-Datei ist korrupt oder konnte nicht geladen werden: {e}. Lade Daten neu.")
cache_valid = False
if not cache_valid or not self.all_indices or not self.tanakh_state:
self._load_all_indices()
self._create_tanakh_holographic_state()
if use_cache:
self._save_to_cache()
def _load_all_indices(self):
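        """Reads indices_by_book/book_NN_index.json for every book in BOOK_RANGE."""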
logging.info("Lade Index-Dateien für alle Bücher...")
for i in BOOK_RANGE:
index_path = os.path.join(INDICES_DIR, f"book_{i:02}_index.json")
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f: self.all_indices[i] = json.load(f)
        if not self.all_indices: sys.exit("No index files found. Please run 'build_indices.py' first.")
logging.info(f"{len(self.all_indices)} Buch-Indizes geladen.")
def _create_tanakh_holographic_state(self):
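        """XOR-folds the gematria binary of the cleaned text of all books into a
        single HOLOGRAPHIC_STATE_SIZE_BITS-bit state string."""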
logging.info("Erstelle holographischen Tanach-State...")
final_state = '0' * HOLOGRAPHIC_STATE_SIZE_BITS
full_binary_text = ""
for i in BOOK_RANGE:
try:
with open(f"texts/torah/{i:02}.json", 'r', encoding='utf-8') as file:
data = json.load(file)
full_text = ' '.join([' '.join(block) for block in data.get("text", [])])
clean_text = re.sub(r"[^\u05D0-\u05EA]+", "", re.sub(r"\[.*?\]", "", full_text, flags=re.DOTALL))
if clean_text:
full_binary_text += self._text_to_gematria_binary(clean_text, for_state=True)
except Exception: continue
self.tanakh_state = self._fold_into_state(full_binary_text)
logging.info("Holographischer Tanach-State wurde erstellt.")
def _save_to_cache(self):
logging.info(f"Speichere Daten in Cache-Datei: {CACHE_FILE}")
data_to_cache = {'indices': self.all_indices, 'state': self.tanakh_state}
with open(CACHE_FILE, 'wb') as f: pickle.dump(data_to_cache, f)
@staticmethod
def _text_to_gematria_binary(text, for_state=False):
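        """Encodes each letter's gematria value as a BITS_PER_CHAR-bit block.

        With for_state=True only Hebrew letters (U+05D0-U+05EA) are kept; otherwise
        lowercase Latin letters are allowed as well (after linearize_umlauts).
        """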
text_for_calc = linearize_umlauts(text.lower())
if for_state:
clean_text = re.sub(r"[^\u05D0-\u05EA]+", "", text_for_calc)
else:
clean_text = re.sub(r"[^a-z\u05D0-\u05EA]+", "", text_for_calc)
logging.debug(f"text_to_gematria_binary (for_state={for_state}): Original='{text[:30]}...', Bereinigt='{clean_text[:30]}...'")
binary_string = "".join(format(letter_to_value(c), f'0{BITS_PER_CHAR}b') for c in clean_text)
logging.debug(f" -> erzeugter Binärstring (erste 64 Bits): {binary_string[:64]}")
return binary_string
@staticmethod
def _fold_into_state(binary_string, initial_state=None):
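        """Folds a binary string of arbitrary length into a fixed-size state by
        XORing consecutive HOLOGRAPHIC_STATE_SIZE_BITS-bit blocks (right-padded
        with zeros)."""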
state = np.array(list(initial_state), dtype=np.int8) if initial_state else np.zeros(HOLOGRAPHIC_STATE_SIZE_BITS, dtype=np.int8)
for i in range(0, len(binary_string), HOLOGRAPHIC_STATE_SIZE_BITS):
block = binary_string[i:i+HOLOGRAPHIC_STATE_SIZE_BITS].ljust(HOLOGRAPHIC_STATE_SIZE_BITS, '0')
state = np.bitwise_xor(state, np.array(list(block), dtype=np.int8))
return "".join(state.astype(str))
def get_best_phrase_from_all_books(self, gematria_val, method):
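        """Finds the best phrase for a gematria value across all book indices.

        Exact matches are ranked by the value's pagerank divided by the phrase
        count (favouring rarer phrases); otherwise neighbouring values (+/-1) and
        a Latin decomposition serve as fallbacks. The `method` argument is kept
        for CLI compatibility but is not used in this lookup.
        """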
best_overall_phrase_obj = None
best_overall_score = -1.0
for book_num, book_index in self.all_indices.items():
candidates = book_index.get(str(gematria_val), {}).get('phrases', [])
if not candidates: continue
pg_score = book_index.get(str(gematria_val), {}).get('pagerank', 0)
best_in_book = max(candidates, key=lambda p: pg_score / p.get('count', 1) if p.get('count', 0) > 0 else 0)
current_score = pg_score / best_in_book.get('count', 1) if best_in_book.get('count', 0) > 0 else 0
if current_score > best_overall_score:
best_overall_score = current_score
best_in_book['source_book'] = book_num
best_overall_phrase_obj = best_in_book
if best_overall_phrase_obj:
return best_overall_phrase_obj, "exact"
for offset in [1, -1]:
for book_num, book_index in self.all_indices.items():
candidates = book_index.get(str(gematria_val + offset), {}).get('phrases', [])
if candidates:
best_in_book = min(candidates, key=lambda p: p.get('position', float('inf')))
best_in_book['source_book'] = book_num
return best_in_book, f"neighbor(d={offset})"
decomposed = decompose_to_latin(gematria_val)
if decomposed:
return {"text": f"[{decomposed}]", "position": -2, "source_book": "N/A"}, "decomposed"
return None, None
def run_fractal_mode(self, query, depth, method):
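        """Recursively expands the query: at every depth each known phrase is
        combined with the initial query, folded, XORed against the Tanakh state,
        and the resulting 16-bit gematria values are resolved back into phrases.
        Results are printed ordered by book/position and by frequency."""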
print(f"\n" + "="*15 + f" FRAKTALE LOGOS-AUSSCHÖPFUNG (Tiefe: {depth}, Methode: {method}) " + "="*15)
initial_logos = query
        # <<<<<<<<<<<<<<<<<< HERE IS THE FIX >>>>>>>>>>>>>>>>>>>>
        # Use 0 as the source book for the initial query to avoid the TypeError
        all_found_phrases_map = {initial_logos: {"text": initial_logos, "position": -1, "depth": 0, "count": 1, "source_book": 0}}
        # <<<<<<<<<<<<<<<<<< END OF FIX >>>>>>>>>>>>>>>>>>>>>
phrases_to_process_this_level = {initial_logos}
for d in range(depth):
logging.info(f"--- Starte Tiefe {d + 1}/{depth} mit {len(phrases_to_process_this_level)} Phrasen ---")
phrases_for_next_level = set()
for p_current in phrases_to_process_this_level:
combined_query = f"{initial_logos} {p_current}"
query_binary = self._text_to_gematria_binary(combined_query)
konzept_state = self._fold_into_state(query_binary)
final_konzept = "".join(str(int(a)^int(b)) for a,b in zip(self.tanakh_state, konzept_state))
for i in range(0, len(final_konzept), BITS_PER_CHAR):
gematria_val = int(final_konzept[i:i+BITS_PER_CHAR], 2)
if gematria_val == 0: continue
phrase_obj, _ = self.get_best_phrase_from_all_books(gematria_val, method)
if phrase_obj:
phrase_text = phrase_obj['text']
if phrase_text not in all_found_phrases_map:
phrase_obj['depth'] = d + 1
phrase_obj['count'] = 1
all_found_phrases_map[phrase_text] = phrase_obj
phrases_for_next_level.add(phrase_text)
else:
all_found_phrases_map[phrase_text]['count'] += 1
if not phrases_for_next_level:
logging.info(f"Keine neuen Phrasen in Tiefe {d + 1} gefunden.")
break
phrases_to_process_this_level = phrases_for_next_level
        # Sort by book and then by position to preserve the narrative order;
        # non-numeric sources (e.g. "N/A" from decomposed values) sort last.
        sorted_by_position = sorted(all_found_phrases_map.values(), key=lambda x: (x.get('source_book') if isinstance(x.get('source_book'), int) else 99, x.get('position', -1)))
        print("\n--- Final synthesis (ordered by book and occurrence in the text) ---")
current_book = -1
for p in sorted_by_position:
book = p.get('source_book')
if book != current_book:
                # Print a header line for each new book
if isinstance(book, int) and book > 0:
print(f"\n--- Buch {book:02d} ---")
elif book == 0:
print(f"--- Query ---")
current_book = book
print(f"{p['text']}", end=" | ")
print("\n")
        # Sort by frequency for the top concepts
sorted_by_count = sorted(all_found_phrases_map.values(), key=lambda x: x['count'], reverse=True)
print("\n--- Top 25 Resonanz-Konzepte (geordnet nach Häufigkeit im Fraktal) ---")
for p in sorted_by_count[:25]:
source = f"B{p.get('source_book', '??'):02d}" if isinstance(p.get('source_book'), int) and p.get('source_book') > 0 else p.get('source_book', 'N/A')
print(f"[{p['count']:2d}x] {p['text']} (Original in {source}, Pos: {p.get('position', 'N/A')})")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Tanakh Holographic Explorer (v13, Final).")
parser.add_argument("query", type=str, help="Die anfängliche Abfragephrase (Logos).")
parser.add_argument("--method", type=str, choices=['frequency', 'network', 'default'], default='network', help="Gewichtungsmethode.")
parser.add_argument("--depth", type=int, default=1, help="Maximale Tiefe der fraktalen Suche.")
parser.add_argument("--no-cache", action="store_true", help="Erzwingt das Neuladen der Daten.")
parser.add_argument("--debug", action="store_true", help="Aktiviert detaillierte Debug-Ausgaben.")
args = parser.parse_args()
setup_logging(args.debug)
engine = TanakhExplorer(use_cache=not args.no_cache)
engine.run_fractal_mode(args.query, args.depth, args.method)
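
# Example invocation (assumes the indices were built with build_indices.py and
# the source texts are available under texts/torah/NN.json):
#   python explore_tanakh.py "בראשית ברא" --depth 2 --method network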