# Spaces: mostafa202025 / Docx-FAISS (Sleeping)
# File size: 4,574 Bytes

import os
import faiss
import numpy as np
import json
from docx import Document
from sentence_transformers import SentenceTransformer
import gradio as gr
import tempfile

# ---------- Settings ----------
# NOTE(review): no code visible in this file reads OUTPUT_DIR
# (save_faiss_and_metadata writes into a fresh tempfile.mkdtemp() directory)
# -- confirm whether this constant is still needed.
OUTPUT_DIR = "/tmp/output_faiss"   # path where the output files are saved
# Model name handed to SentenceTransformer(...) when embeddings are built.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# ---------- Convert a docx file into a JSON-style section structure ----------
def _build_section(heading, body_lines, **extra):
    """Assemble one section record from a heading and its collected body lines.

    ``content`` and ``full_content`` both hold the newline-joined body; any
    keyword arguments (e.g. ``parent``) are appended as additional keys.
    """
    body = "\n".join(body_lines)
    return {"heading": heading, "content": body, "full_content": body, **extra}


def docx_to_sections(docx_path):
    """Split a Word document into sections delimited by Heading 1/2 paragraphs.

    Paragraphs are scanned in order. A paragraph whose style name starts with
    ``"Heading 1"`` or ``"Heading 2"`` opens a new section; every other
    non-blank paragraph (deeper heading levels included) is appended to the
    body of the section that is currently open.

    Args:
        docx_path: Anything ``Document(...)`` accepts (typically a file path),
            or an already opened document-like object exposing ``.paragraphs``.

    Returns:
        A list of dicts in document order. Heading 1 sections carry
        ``heading``, ``content`` and ``full_content``; Heading 2 sections
        additionally carry ``parent`` (the text of the enclosing Heading 1, or
        ``None`` when no Heading 1 preceded it). ``content`` and
        ``full_content`` are identical.

    Notes:
        * Blank paragraphs are skipped.
        * Text that appears before the first heading is discarded, so a
          document without any Heading 1/2 paragraph yields an empty list.
        * A Heading 1 that is followed directly by another heading (no body
          text) produces no section of its own, except when it is the last
          heading of the document.
    """
    # Accept an already loaded document as well as a path.
    doc = docx_path if hasattr(docx_path, "paragraphs") else Document(docx_path)

    sections = []
    current_h1 = None
    current_h2 = None
    body_lines = []  # stripped, non-empty paragraph texts of the open section

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        style = para.style.name

        if style.startswith("Heading 1"):
            if current_h2:
                sections.append(
                    _build_section(current_h2, body_lines, parent=current_h1)
                )
            elif current_h1 and body_lines:
                sections.append(_build_section(current_h1, body_lines))
            current_h1 = text
            current_h2 = None
            body_lines = []

        elif style.startswith("Heading 2"):
            if current_h2:
                sections.append(
                    _build_section(current_h2, body_lines, parent=current_h1)
                )
            elif current_h1 and body_lines:
                # Text between a Heading 1 and its first Heading 2 belongs to
                # the Heading 1: emit it before the buffer is reset, otherwise
                # it would be lost.
                sections.append(_build_section(current_h1, body_lines))
            current_h2 = text
            body_lines = []

        else:
            body_lines.append(text)

    # Emit whatever section is still open at the end of the document.
    if current_h2:
        sections.append(_build_section(current_h2, body_lines, parent=current_h1))
    elif current_h1:
        sections.append(_build_section(current_h1, body_lines))

    return sections

# ---------- Generate embeddings ----------
def generate_embeddings(sections, model):
    """Encode each section's ``content`` text and return the result as float32.

    Args:
        sections: Iterable of dicts, each providing a ``"content"`` string.
        model: Object exposing ``encode(texts, convert_to_numpy=...,
            normalize_embeddings=..., show_progress_bar=...)``.

    Returns:
        Whatever ``model.encode`` produced, cast with ``.astype("float32")``.
    """
    corpus = [section["content"] for section in sections]
    encode_options = {
        "convert_to_numpy": True,
        # Ask the model for normalized vectors (for better accuracy in FAISS).
        "normalize_embeddings": True,
        "show_progress_bar": True,
    }
    vectors = model.encode(corpus, **encode_options)
    return vectors.astype("float32")

# ---------- Save FAISS index + metadata ----------
def save_faiss_and_metadata(embeddings, sections, base_name):
    """Write a flat L2 FAISS index and the section metadata to a temp directory.

    Args:
        embeddings: 2-D array of shape ``(n_vectors, dim)``; all rows are added
            to a new ``faiss.IndexFlatL2(dim)``.
        sections: JSON-serializable section records, dumped as UTF-8 JSON
            (``ensure_ascii=False``, ``indent=2``).
        base_name: Suffix used in both output file names.

    Returns:
        ``(faiss_path, metadata_path)`` -- ``faiss_index_<base_name>.bin`` and
        ``metadata_<base_name>.json`` inside a directory freshly created with
        ``tempfile.mkdtemp()``. This function never removes that directory.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example the empty result
            obtained when there were no sections to encode).
    """
    # Validate before touching the filesystem so that bad input neither fails
    # with an opaque IndexError on shape[1] nor leaves an empty temp dir behind.
    if embeddings.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_sections, dim); "
            f"got shape {embeddings.shape}"
        )

    # mkdtemp() creates the directory itself; no extra makedirs call is needed.
    temp_dir = tempfile.mkdtemp()

    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    faiss_path = os.path.join(temp_dir, f"faiss_index_{base_name}.bin")
    metadata_path = os.path.join(temp_dir, f"metadata_{base_name}.json")

    faiss.write_index(index, faiss_path)

    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(sections, f, ensure_ascii=False, indent=2)

    print(f"✅ ذخیره شد:\n - {faiss_path}\n - {metadata_path}")
    return faiss_path, metadata_path

# Loaded embedding models keyed by model name, so repeated requests reuse them.
_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for *model_name*, loading it at most once."""
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return model


def build_from_docx(docx_file_path):
    """Run the full pipeline for one DOCX file: sections -> embeddings -> files.

    Args:
        docx_file_path: Path of the DOCX file. Its lower-cased base name
            (without extension) is used as the suffix of the output file names.

    Returns:
        ``(message, faiss_path, metadata_path)`` where ``message`` is a fixed
        status string and the two paths are those returned by
        ``save_faiss_and_metadata``.

    Raises:
        ValueError: If no section could be extracted from the document, since
            there is then nothing to embed or index.
    """
    print(f"📄 پردازش فایل: {docx_file_path}")
    sections = docx_to_sections(docx_file_path)
    print(f"🧩 {len(sections)} بخش استخراج شد.")

    # Fail early with a clear message instead of loading the model and then
    # crashing further down on an empty embeddings array.
    if not sections:
        raise ValueError(
            "No sections found: the document contains no 'Heading 1' or "
            "'Heading 2' paragraphs."
        )

    # Reuse the model across calls; constructing it again for every request
    # repeats the whole model load.
    model = _get_embedding_model(EMBEDDING_MODEL_NAME)
    embeddings = generate_embeddings(sections, model)

    base_name = os.path.splitext(os.path.basename(docx_file_path))[0].lower()
    faiss_path, metadata_path = save_faiss_and_metadata(embeddings, sections, base_name)

    return "فایل‌های FAISS و متادیتا ایجاد شدند.", faiss_path, metadata_path

def process_docx(file):
    """Gradio callback: build the FAISS index and metadata for an uploaded DOCX.

    Args:
        file: The value Gradio passes for the upload component -- either a
            path string or an object exposing the path as ``.name``; ``None``
            when nothing was uploaded.

    Returns:
        ``(message, faiss_file, metadata_file)`` matching the three output
        components: the status text and two ``gr.File`` values pointing at the
        generated files.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        # Without this guard the callback dies with an AttributeError on None.
        raise gr.Error("Please upload a DOCX file first.")

    # Accept both a plain path string and an object carrying the path in .name.
    docx_path = getattr(file, "name", file)

    message, faiss_path, metadata_path = build_from_docx(docx_path)
    return message, gr.File(faiss_path), gr.File(metadata_path)

# Upload widget for the source document.
_docx_upload = gr.File(file_count="single", type="filepath", label="Upload DOCX File")

# Result widgets, in the same order as the values returned by process_docx.
_result_widgets = [
    gr.Textbox(label="Output"),
    gr.File(label="Download FAISS Index"),
    gr.File(label="Download Metadata"),
]

# The web UI: one upload in, a status text and two downloadable files out.
iface = gr.Interface(
    fn=process_docx,
    inputs=_docx_upload,
    outputs=_result_widgets,
    title="Docx to FAISS & Metadata Generator",
    description="Upload a DOCX file, and it will process the contents to generate FAISS index and metadata.",
)

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()