import os
import json
import tempfile

import faiss
import numpy as np
import gradio as gr
from docx import Document
from sentence_transformers import SentenceTransformer

# ---------- Settings ----------
OUTPUT_DIR = "./output_faiss"  # directory where output files are written
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


# ---------- Convert a DOCX file into a JSON-style section structure ----------
def docx_to_sections(docx_path):
    """Split a .docx document into sections keyed by Heading 1 / Heading 2.

    Returns a list of dicts with keys "heading", "content", "full_content"
    and, for Heading-2 sections, "parent" (the enclosing Heading-1 text).
    Empty paragraphs are skipped; body text is accumulated under the most
    recent heading.
    """
    doc = Document(docx_path)
    sections = []
    current_h1 = None
    current_h2 = None
    buffer = ""

    def flush_h2():
        # Emit the pending Heading-2 section, if any, and reset state.
        nonlocal current_h2, buffer
        if current_h2:
            body = buffer.strip()
            sections.append({
                "heading": current_h2,
                "content": body,
                "full_content": body,
                "parent": current_h1,
            })
            current_h2 = None
            buffer = ""

    def flush_h1():
        # Emit text that sits directly under the current Heading 1.
        nonlocal buffer
        if current_h1 and buffer:
            body = buffer.strip()
            sections.append({
                "heading": current_h1,
                "content": body,
                "full_content": body,
            })
            buffer = ""

    for para in doc.paragraphs:
        style = para.style.name
        text = para.text.strip()
        if not text:
            continue
        if style.startswith("Heading 1"):
            flush_h2()
            flush_h1()
            current_h1 = text
            buffer = ""
        elif style.startswith("Heading 2"):
            flush_h2()
            # BUG FIX: text between a Heading 1 and its first Heading 2 was
            # previously discarded here (buffer reset without a flush).
            flush_h1()
            current_h2 = text
            buffer = ""
        else:
            buffer += text + "\n"

    # Flush whatever remains at end of document (mirrors original behavior:
    # a trailing heading is emitted even if its body is empty).
    if current_h2:
        body = buffer.strip()
        sections.append({
            "heading": current_h2,
            "content": body,
            "full_content": body,
            "parent": current_h1,
        })
    elif current_h1:
        body = buffer.strip()
        sections.append({
            "heading": current_h1,
            "content": body,
            "full_content": body,
        })
    return sections


# ---------- Generate embeddings ----------
def generate_embeddings(sections, model):
    """Encode each section's "content" with *model*.

    Returns a float32 numpy array of shape (len(sections), dim).
    Embeddings are L2-normalized for better accuracy in FAISS.
    """
    texts = [s["content"] for s in sections]
    embeddings = model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,  # normalization for better FAISS accuracy
        show_progress_bar=True,
    )
    return embeddings.astype("float32")


# ---------- Save FAISS index + metadata ----------
def save_faiss_and_metadata(embeddings, sections, base_name):
    """Write a flat-L2 FAISS index and the section metadata to OUTPUT_DIR.

    Files are named faiss_index_<base_name>.bin / metadata_<base_name>.json.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)

    faiss_path = os.path.join(OUTPUT_DIR, f"faiss_index_{base_name}.bin")
    metadata_path = os.path.join(OUTPUT_DIR, f"metadata_{base_name}.json")
    faiss.write_index(index, faiss_path)
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(sections, f, ensure_ascii=False, indent=2)
    print(f"✅ ذخیره شد:\n - {faiss_path}\n - {metadata_path}")


# ---------- Full pipeline ----------
def build_from_docx(docx_file):
    """Run the full DOCX -> sections -> embeddings -> FAISS pipeline.

    Accepts either a filesystem path (Gradio >= 4, type="filepath") or a
    legacy file-like object with .read() (backward compatible with the old
    Gradio 3 type="file" input). Returns a human-readable status string.
    """
    temp_path = None
    try:
        if isinstance(docx_file, (str, os.PathLike)):
            # Modern Gradio hands us a path to its own temp copy.
            docx_path = str(docx_file)
            source_name = os.path.basename(docx_path)
        elif hasattr(docx_file, "read"):
            # Legacy file-like upload: spill to a temp file for python-docx.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
                tmp.write(docx_file.read())
                temp_path = tmp.name
            docx_path = temp_path
            # BUG FIX: derive the output base name from the uploaded file's
            # name, not from the random temp-file name.
            source_name = os.path.basename(getattr(docx_file, "name", temp_path))
        else:
            # Gradio FileData-style object exposing .name as a path.
            docx_path = docx_file.name
            source_name = os.path.basename(docx_path)

        print(f"📄 پردازش فایل: {docx_path}")
        sections = docx_to_sections(docx_path)
        print(f"🧩 {len(sections)} بخش استخراج شد.")
        if not sections:
            # Guard: embeddings.shape[1] would crash on an empty batch.
            return "No sections were extracted from the document."

        model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        embeddings = generate_embeddings(sections, model)

        base_name = os.path.splitext(source_name)[0].lower()
        save_faiss_and_metadata(embeddings, sections, base_name)
        return f"فایل‌های FAISS و متادیتا ایجاد شد و در مسیر {OUTPUT_DIR} ذخیره شدند."
    finally:
        # BUG FIX: the temporary copy was previously never deleted.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass  # best-effort cleanup; never mask the real result


# ---------- Gradio UI ----------
def process_docx(file):
    """Gradio callback: forward the uploaded file to the pipeline."""
    return build_from_docx(file)


iface = gr.Interface(
    fn=process_docx,
    # BUG FIX: type="file" was removed in Gradio 4 (raises ValueError);
    # "filepath" is the supported value and build_from_docx handles it.
    inputs=gr.File(file_count="single", type="filepath", label="Upload DOCX File"),
    outputs=gr.Textbox(label="Output"),
    title="Docx to FAISS & Metadata Generator",
    description="Upload a DOCX file, and it will process the contents to generate FAISS index and metadata.",
)

if __name__ == "__main__":
    iface.launch()