def get_text_from_firebase(text_id):
    """Fetch a stored text from the Firestore "texts" collection.

    Args:
        text_id: Identifier of the document to look up in "texts".

    Returns:
        The value of the document's "content" field, or None when the
        document does not exist or when any exception is raised during the
        lookup (the exception is reported to the user via ``st.error``).
    """
    try:
        snapshot = db.collection("texts").document(text_id).get()
        # Guard clause: a missing document is not an error, just "no text".
        if not snapshot.exists:
            return None
        # A missing "content" key raises KeyError, which is handled below
        # like any other retrieval failure.
        return snapshot.to_dict()["content"]
    except Exception as e:
        # UI boundary: surface the failure in the Streamlit page instead of
        # letting it propagate and abort the script run.
        st.error(f"Error retrieving text from Firebase: {e}")
    return None
  • \2
  • ', text, flags=re.MULTILINE) text = re.sub(r'(
  • .*
  • \n)+', r'\n', text) text = re.sub(r'^([A-Z][A-Z\s]+:?)\s*$', r'

    \1

    ', text, flags=re.MULTILINE) text = re.sub(r'(https?://\S+)', r'\1', text) paragraphs = text.split('\n\n') processed = [] for p in paragraphs: p = p.strip() if p: if p.startswith('