Spaces:
Sleeping
Sleeping
File size: 4,574 Bytes
d5e4624 3155e02 d5e4624 b9b53e9 116f242 d5e4624 b9b53e9 d5e4624 b9b53e9 d5e4624 b9b53e9 d5e4624 44a1aa4 d5e4624 44a1aa4 b9b53e9 d5e4624 b9b53e9 d5e4624 b9b53e9 d5e4624 3155e02 d5e4624 44a1aa4 b9b53e9 d5e4624 3155e02 795d8c0 3155e02 d5e4624 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import os
import faiss
import numpy as np
import json
from docx import Document
from sentence_transformers import SentenceTransformer
import gradio as gr
import tempfile
# ---------- Settings ----------
# NOTE(review): OUTPUT_DIR is not referenced by any code visible in this file;
# save_faiss_and_metadata writes into a fresh tempfile.mkdtemp() directory instead.
OUTPUT_DIR = "/tmp/output_faiss" # Path for saving the output files
# Model identifier handed to SentenceTransformer(...) in build_from_docx.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# ---------- Convert a DOCX file into a JSON-ready section structure ----------
def docx_to_sections(docx_path):
    """Split a DOCX file into heading-delimited sections.

    Paragraphs are read in document order. A paragraph whose style name starts
    with "Heading 1" or "Heading 2" opens a new section; every other non-empty
    paragraph (including deeper heading levels) is collected as body text of
    the section that is currently open.

    Args:
        docx_path: Path (or file-like object) accepted by ``docx.Document``.

    Returns:
        A list of dicts in document order. Every dict has the keys
        ``"heading"``, ``"content"`` and ``"full_content"`` (the latter two
        hold the same newline-joined body text). Sections opened by a
        "Heading 2" additionally carry ``"parent"``: the text of the enclosing
        "Heading 1", or ``None`` when there is none.

    Notes:
        * Body text that sits directly under a "Heading 1" and is followed by
          a "Heading 2" is emitted as its own Heading-1 section. Previously
          that text was silently dropped.
        * Text that appears before the first heading is not attached to any
          section and is discarded.
        * A "Heading 1" with no body and no sub-headings is skipped, except
          when it is the last heading of the document, where it is kept with
          empty content.
    """
    doc = Document(docx_path)
    sections = []
    current_h1 = None
    current_h2 = None
    lines = []  # stripped body paragraphs collected since the last heading

    def close_open_section(keep_empty_h1=False):
        """Append the section being collected (if any) to ``sections``."""
        body = "\n".join(lines)
        if current_h2:
            sections.append({
                "heading": current_h2,
                "content": body,
                "full_content": body,
                "parent": current_h1
            })
        elif current_h1 and (lines or keep_empty_h1):
            sections.append({
                "heading": current_h1,
                "content": body,
                "full_content": body
            })

    for para in doc.paragraphs:
        style = para.style.name
        text = para.text.strip()
        if not text:
            continue

        if style.startswith("Heading 1"):
            close_open_section()
            current_h1 = text
            current_h2 = None
            lines = []
        elif style.startswith("Heading 2"):
            # Also flushes intro text written directly under the Heading 1,
            # which would otherwise be lost when the buffer is reset below.
            close_open_section()
            current_h2 = text
            lines = []
        else:
            lines.append(text)

    # The trailing section is always emitted, even when a final Heading 1
    # has no body text.
    close_open_section(keep_empty_h1=True)
    return sections
# ---------- Generate embeddings ----------
def generate_embeddings(sections, model):
    """Encode the ``content`` text of every section into a float32 array.

    Args:
        sections: Iterable of dicts, each providing a ``"content"`` string.
        model: Object exposing ``encode(texts, **options)``; in this file a
            ``SentenceTransformer`` instance is passed in.

    Returns:
        The value returned by ``model.encode`` cast to ``float32``.
    """
    encode_options = {
        "convert_to_numpy": True,
        # Ask the encoder for normalized vectors before they go into FAISS.
        "normalize_embeddings": True,
        "show_progress_bar": True,
    }
    section_texts = [section["content"] for section in sections]
    vectors = model.encode(section_texts, **encode_options)
    return vectors.astype("float32")
# ---------- Save FAISS index + metadata ----------
def save_faiss_and_metadata(embeddings, sections, base_name):
    """Write a flat L2 FAISS index and its JSON metadata to a new temp directory.

    Args:
        embeddings: 2-D float32 NumPy array with one row per section.
        sections: Sequence of JSON-serialisable section records; its length
            must equal the number of embedding rows so that FAISS row ids map
            onto metadata entries.
        base_name: Suffix used in both output file names.

    Returns:
        ``(faiss_path, metadata_path)``: paths of the written
        ``faiss_index_<base_name>.bin`` and ``metadata_<base_name>.json``.

    Raises:
        ValueError: If ``embeddings`` is empty or not 2-D, or if its row count
            differs from ``len(sections)``.

    Notes:
        The directory comes from ``tempfile.mkdtemp()`` and is not removed
        here; cleaning it up is left to the caller.
    """
    # Validate before touching the filesystem so a bad call leaves nothing behind.
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array, got shape "
            f"{embeddings.shape}"
        )
    if embeddings.shape[0] != len(sections):
        raise ValueError(
            f"got {embeddings.shape[0]} embeddings for {len(sections)} sections"
        )

    # Use a temporary directory (mkdtemp already creates it).
    temp_dir = tempfile.mkdtemp()

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    faiss_path = os.path.join(temp_dir, f"faiss_index_{base_name}.bin")
    metadata_path = os.path.join(temp_dir, f"metadata_{base_name}.json")

    faiss.write_index(index, faiss_path)
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(sections, f, ensure_ascii=False, indent=2)

    print(f"✅ ذخیره شد:\n - {faiss_path}\n - {metadata_path}")
    return faiss_path, metadata_path
def build_from_docx(docx_file_path):
    """Run the full pipeline for one DOCX file: sections -> embeddings -> files.

    Args:
        docx_file_path: Path of the DOCX file to process. Its lower-cased
            base name (without extension) is used in the output file names.

    Returns:
        ``(message, faiss_path, metadata_path)``: a status message followed by
        the two paths returned by ``save_faiss_and_metadata``.

    Raises:
        ValueError: If no sections could be extracted from the document.
    """
    print(f"📄 پردازش فایل: {docx_file_path}")
    sections = docx_to_sections(docx_file_path)
    print(f"🧩 {len(sections)} بخش استخراج شد.")

    # Fail before the (expensive) model load: with nothing to embed the later
    # steps cannot build an index anyway.
    if not sections:
        raise ValueError(
            "No sections were extracted from the document; it needs at least "
            "one 'Heading 1' or 'Heading 2' paragraph."
        )

    # NOTE: the model is instantiated on every call.
    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    embeddings = generate_embeddings(sections, model)

    base_name = os.path.splitext(os.path.basename(docx_file_path))[0].lower()
    faiss_path, metadata_path = save_faiss_and_metadata(embeddings, sections, base_name)
    return "فایلهای FAISS و متادیتا ایجاد شدند.", faiss_path, metadata_path
def process_docx(file):
    """Gradio callback: build the FAISS index and metadata for an uploaded file.

    Args:
        file: Value delivered by the ``gr.File`` input. It may be a path
            string, an object exposing the path as ``.name``, or ``None``
            when nothing was uploaded.

    Returns:
        ``(message, faiss_file, metadata_file)`` matching the three output
        components of the interface.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        raise gr.Error("Please upload a DOCX file first.")

    # Accept both a plain path string and an object carrying the path in `.name`.
    docx_path = getattr(file, "name", file)

    message, faiss_path, metadata_path = build_from_docx(docx_path)
    return message, gr.File(faiss_path), gr.File(metadata_path)
# Gradio UI: one DOCX upload in; a status message plus two downloadable files out.
iface = gr.Interface(
    title="Docx to FAISS & Metadata Generator",
    description="Upload a DOCX file, and it will process the contents to generate FAISS index and metadata.",
    fn=process_docx,
    inputs=gr.File(label="Upload DOCX File", file_count="single", type="filepath"),
    outputs=[
        gr.Textbox(label="Output"),
        gr.File(label="Download FAISS Index"),
        gr.File(label="Download Metadata"),
    ],
)

if __name__ == "__main__":
    # Launch the app only when run as a script, not when imported.
    iface.launch()
|