import os
import json
import tempfile

import faiss
import gradio as gr
from docx import Document
from sentence_transformers import SentenceTransformer

# ---------- Settings ----------
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"


# ---------- Convert a DOCX file into a section structure ----------
def docx_to_sections(docx_path):
    """Group paragraph text under its nearest Heading 1 / Heading 2."""
    doc = Document(docx_path)
    sections = []
    current_h1 = None
    current_h2 = None
    buffer = ""

    for para in doc.paragraphs:
        style = para.style.name
        text = para.text.strip()
        if not text:
            continue

        if style.startswith("Heading 1"):
            # Flush whatever section was being collected before this new chapter.
            if current_h2:
                sections.append({
                    "heading": current_h2,
                    "content": buffer.strip(),
                    "full_content": buffer.strip(),
                    "parent": current_h1,
                })
                current_h2 = None
            elif current_h1 and buffer:
                sections.append({
                    "heading": current_h1,
                    "content": buffer.strip(),
                    "full_content": buffer.strip(),
                })
            current_h1 = text
            buffer = ""

        elif style.startswith("Heading 2"):
            # Flush the previous subsection, or any intro text that sat
            # directly under the current Heading 1 before its first Heading 2
            # (the original dropped that intro text silently).
            if current_h2:
                sections.append({
                    "heading": current_h2,
                    "content": buffer.strip(),
                    "full_content": buffer.strip(),
                    "parent": current_h1,
                })
            elif current_h1 and buffer:
                sections.append({
                    "heading": current_h1,
                    "content": buffer.strip(),
                    "full_content": buffer.strip(),
                })
            current_h2 = text
            buffer = ""

        else:
            buffer += text + "\n"

    # Flush the final open section.
    if current_h2:
        sections.append({
            "heading": current_h2,
            "content": buffer.strip(),
            "full_content": buffer.strip(),
            "parent": current_h1,
        })
    elif current_h1:
        sections.append({
            "heading": current_h1,
            "content": buffer.strip(),
            "full_content": buffer.strip(),
        })
    return sections


# ---------- Generate embeddings ----------
def generate_embeddings(sections, model):
    texts = [s["content"] for s in sections]
    embeddings = model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,  # unit-length vectors, so L2 distance ranks like cosine similarity
        show_progress_bar=True,
    )
    return embeddings.astype("float32")  # FAISS expects float32


# ---------- Save FAISS index + metadata ----------
def save_faiss_and_metadata(embeddings, sections, base_name):
    # Write into a fresh temporary directory so Gradio can serve the files for download.
    temp_dir = tempfile.mkdtemp()

    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)  # exact (brute-force) L2 search
    index.add(embeddings)

    faiss_path = os.path.join(temp_dir, f"faiss_index_{base_name}.bin")
    metadata_path = os.path.join(temp_dir, f"metadata_{base_name}.json")
    faiss.write_index(index, faiss_path)
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(sections, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved:\n - {faiss_path}\n - {metadata_path}")
    return faiss_path, metadata_path


def build_from_docx(docx_file_path):
    print(f"📄 Processing file: {docx_file_path}")
    sections = docx_to_sections(docx_file_path)
    print(f"🧩 Extracted {len(sections)} sections.")

    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    embeddings = generate_embeddings(sections, model)

    base_name = os.path.splitext(os.path.basename(docx_file_path))[0].lower()
    faiss_path, metadata_path = save_faiss_and_metadata(embeddings, sections, base_name)
    return "FAISS index and metadata files were created.", faiss_path, metadata_path


def process_docx(file_path):
    # With type="filepath", gr.File passes the uploaded file's path as a string,
    # so it is handed to build_from_docx directly (no .name attribute involved).
    message, faiss_path, metadata_path = build_from_docx(file_path)
    # Return the paths themselves; the gr.File output components render them as downloads.
    return message, faiss_path, metadata_path


iface = gr.Interface(
    fn=process_docx,
    inputs=gr.File(file_count="single", type="filepath", label="Upload DOCX File"),
    outputs=[
        gr.Textbox(label="Output"),
        gr.File(label="Download FAISS Index"),
        gr.File(label="Download Metadata"),
    ],
    title="Docx to FAISS & Metadata Generator",
    description="Upload a DOCX file; its Heading 1 / Heading 2 structure is split into sections, embedded, and packaged as a downloadable FAISS index plus JSON metadata.",
)

if __name__ == "__main__":
    iface.launch()
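
# ---------- Example: querying the generated files (sketch) ----------
# A minimal usage sketch, not part of the app above. It assumes the two
# downloaded files are named "faiss_index_mydoc.bin" and "metadata_mydoc.json"
# (the actual names depend on the uploaded file) and shows how they could be
# searched. Because the stored vectors are normalized, a smaller L2 distance
# corresponds to a higher cosine similarity.
#
#   import faiss, json
#   from sentence_transformers import SentenceTransformer
#
#   index = faiss.read_index("faiss_index_mydoc.bin")
#   with open("metadata_mydoc.json", encoding="utf-8") as f:
#       sections = json.load(f)
#
#   model = SentenceTransformer(EMBEDDING_MODEL_NAME)
#   query = model.encode(
#       ["example question about the document"],
#       convert_to_numpy=True,
#       normalize_embeddings=True,  # must match how the index was built
#   ).astype("float32")
#
#   distances, ids = index.search(query, 3)  # top-3 nearest sections
#   for dist, i in zip(distances[0], ids[0]):
#       print(f"{dist:.4f}  {sections[i]['heading']}")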