import os
import json
import tempfile

import faiss
import numpy as np
import gradio as gr
from docx import Document
from sentence_transformers import SentenceTransformer

# ---------- Settings ----------
OUTPUT_DIR = "./output_faiss"  # directory where output files are written
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


# ---------- Convert a DOCX file into a JSON-style section structure ----------
def docx_to_sections(docx_path):
    """Split a .docx document into sections keyed by Heading 1 / Heading 2.

    Returns a list of dicts with keys "heading", "content", "full_content"
    and, for Heading-2 sections, "parent" (the enclosing Heading-1 text).
    Empty paragraphs are skipped; body text is accumulated under the most
    recent heading.
    """
    doc = Document(docx_path)
    sections = []
    current_h1 = None
    current_h2 = None
    buffer = ""

    def flush_h2():
        # Emit the pending Heading-2 section, if any, and reset state.
        nonlocal current_h2, buffer
        if current_h2:
            body = buffer.strip()
            sections.append({
                "heading": current_h2,
                "content": body,
                "full_content": body,
                "parent": current_h1,
            })
            current_h2 = None
            buffer = ""

    def flush_h1():
        # Emit text that sits directly under the current Heading 1.
        nonlocal buffer
        if current_h1 and buffer:
            body = buffer.strip()
            sections.append({
                "heading": current_h1,
                "content": body,
                "full_content": body,
            })
            buffer = ""

    for para in doc.paragraphs:
        style = para.style.name
        text = para.text.strip()
        if not text:
            continue
        if style.startswith("Heading 1"):
            flush_h2()
            flush_h1()
            current_h1 = text
            buffer = ""
        elif style.startswith("Heading 2"):
            flush_h2()
            # BUG FIX: text between a Heading 1 and its first Heading 2 was
            # previously discarded here (buffer reset without a flush).
            flush_h1()
            current_h2 = text
            buffer = ""
        else:
            buffer += text + "\n"

    # Flush whatever remains at end of document (mirrors original behavior:
    # a trailing heading is emitted even if its body is empty).
    if current_h2:
        body = buffer.strip()
        sections.append({
            "heading": current_h2,
            "content": body,
            "full_content": body,
            "parent": current_h1,
        })
    elif current_h1:
        body = buffer.strip()
        sections.append({
            "heading": current_h1,
            "content": body,
            "full_content": body,
        })
    return sections


# ---------- Generate embeddings ----------
def generate_embeddings(sections, model):
    """Encode each section's "content" with *model*.

    Returns a float32 numpy array of shape (len(sections), dim).
    Embeddings are L2-normalized for better accuracy in FAISS.
    """
    texts = [s["content"] for s in sections]
    embeddings = model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,  # normalization for better FAISS accuracy
        show_progress_bar=True,
    )
    return embeddings.astype("float32")


# ---------- Save FAISS index + metadata ----------
def save_faiss_and_metadata(embeddings, sections, base_name):
    """Write a flat-L2 FAISS index and the section metadata to OUTPUT_DIR.

    Files are named faiss_index_<base_name>.bin / metadata_<base_name>.json.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)

    faiss_path = os.path.join(OUTPUT_DIR, f"faiss_index_{base_name}.bin")
    metadata_path = os.path.join(OUTPUT_DIR, f"metadata_{base_name}.json")
    faiss.write_index(index, faiss_path)
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(sections, f, ensure_ascii=False, indent=2)
    print(f"✅ ذخیره شد:\n - {faiss_path}\n - {metadata_path}")


# ---------- Full pipeline ----------
def build_from_docx(docx_file):
    """Run the full DOCX -> sections -> embeddings -> FAISS pipeline.

    Accepts either a filesystem path (Gradio >= 4, type="filepath") or a
    legacy file-like object with .read() (backward compatible with the old
    Gradio 3 type="file" input). Returns a human-readable status string.
    """
    temp_path = None
    try:
        if isinstance(docx_file, (str, os.PathLike)):
            # Modern Gradio hands us a path to its own temp copy.
            docx_path = str(docx_file)
            source_name = os.path.basename(docx_path)
        elif hasattr(docx_file, "read"):
            # Legacy file-like upload: spill to a temp file for python-docx.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
                tmp.write(docx_file.read())
                temp_path = tmp.name
            docx_path = temp_path
            # BUG FIX: derive the output base name from the uploaded file's
            # name, not from the random temp-file name.
            source_name = os.path.basename(getattr(docx_file, "name", temp_path))
        else:
            # Gradio FileData-style object exposing .name as a path.
            docx_path = docx_file.name
            source_name = os.path.basename(docx_path)

        print(f"📄 پردازش فایل: {docx_path}")
        sections = docx_to_sections(docx_path)
        print(f"🧩 {len(sections)} بخش استخراج شد.")
        if not sections:
            # Guard: embeddings.shape[1] would crash on an empty batch.
            return "No sections were extracted from the document."

        model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        embeddings = generate_embeddings(sections, model)

        base_name = os.path.splitext(source_name)[0].lower()
        save_faiss_and_metadata(embeddings, sections, base_name)
        return f"فایل‌های FAISS و متادیتا ایجاد شد و در مسیر {OUTPUT_DIR} ذخیره شدند."
    finally:
        # BUG FIX: the temporary copy was previously never deleted.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass  # best-effort cleanup; never mask the real result


# ---------- Gradio UI ----------
def process_docx(file):
    """Gradio callback: forward the uploaded file to the pipeline."""
    return build_from_docx(file)


iface = gr.Interface(
    fn=process_docx,
    # BUG FIX: type="file" was removed in Gradio 4 (raises ValueError);
    # "filepath" is the supported value and build_from_docx handles it.
    inputs=gr.File(file_count="single", type="filepath", label="Upload DOCX File"),
    outputs=gr.Textbox(label="Output"),
    title="Docx to FAISS & Metadata Generator",
    description="Upload a DOCX file, and it will process the contents to generate FAISS index and metadata.",
)

if __name__ == "__main__":
    iface.launch()