Spaces:
Running
Running
File size: 3,962 Bytes
f65750f 2723c4f f65750f 4a97e8c f65750f 4a97e8c f65750f 2723c4f f65750f 4a97e8c f65750f 4a97e8c f65750f 4a97e8c f65750f 4a97e8c f65750f 4a97e8c f65750f 2723c4f f65750f 2723c4f f65750f 2723c4f 4a97e8c f65750f 4a97e8c f65750f 4a97e8c f65750f 7a013c2 f65750f 4a97e8c f65750f 29df71b 4a97e8c 29df71b 4a97e8c 29df71b f65750f 29df71b f65750f 7a013c2 f65750f 4a97e8c f65750f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import json
import time
import os
from pathlib import Path
from typing import Dict, Any, List
import chromadb
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.document import DoclingDocument
from docling.chunking.hierarchical_chunker import HierarchicalChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
class DocumentProcessor:
    """Convert documents with Docling, chunk them hierarchically, and index
    the chunks (text + embeddings + metadata) in a persistent ChromaDB store."""

    def __init__(self):
        """Initialize the document converter, embedding model, and Chroma client."""
        self.setup_document_converter()
        self.embed_model = FastEmbedEmbeddings()
        # Persist vectors on disk under ./chroma_db so the index survives restarts.
        self.client = chromadb.PersistentClient(path="chroma_db")

    def setup_document_converter(self):
        """Configure a DocumentConverter supporting PDF, image, Office, and HTML input."""
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True              # OCR scanned/image-only pages
        pipeline_options.do_table_structure = True  # recover table structure from PDFs

        self.converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    # BUG FIX: Docling expects the backend *class* here, not an
                    # instance — the converter constructs a backend per document.
                    backend=PyPdfiumDocumentBackend,
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline,
                ),
            },
        )

    def process_document(self, file_path: str):
        """Convert, chunk, embed, and index a single document.

        Args:
            file_path: Path to the input document (PDF, DOCX, PPTX, HTML, image).

        Returns:
            The Chroma collection holding the indexed chunks, or ``None``
            when document conversion fails.
        """
        print(f"Processing document: {file_path}")
        start_time = time.time()

        try:
            conv_result = self.converter.convert(file_path)
            doc: DoclingDocument = conv_result.document
        except Exception as e:
            # Best-effort: report the failure and signal it via None rather
            # than crashing the caller.
            print(f"Conversion failed: {e}")
            return None

        chunker = HierarchicalChunker()
        chunks = list(chunker.chunk(doc))

        processed_chunks = []
        for chunk in chunks:
            processed_chunks.append({
                "text": chunk.text.strip(),
                # NOTE(review): collects the text of *all* doc items, not only
                # headings — presumably intended as heading context; verify.
                "headings": [item.text for item in chunk.doc_items if hasattr(item, "text")],
                # str() because labels may be enum values and Chroma metadata
                # must be primitive types (str/int/float/bool).
                "content_type": str(chunk.doc_items[0].label) if chunk.doc_items else "Unknown",
            })

        print("Chunking completed. Creating vector database...")
        collection = self.client.get_or_create_collection(name="document_chunks")

        documents, metadata_list, ids = [], [], []
        for idx, chunk in enumerate(processed_chunks):
            text = chunk["text"]
            if not text:  # skip empty chunks — nothing useful to index
                continue
            documents.append(text)
            metadata_list.append({
                # Chroma metadata values must be primitives, so serialize the list.
                "headings": json.dumps(chunk["headings"]),
                "content_type": chunk["content_type"],
            })
            ids.append(str(idx))

        if documents:
            # Embed all texts in a single batch call instead of one model
            # call per chunk — same results, far fewer round-trips.
            embeddings = self.embed_model.embed_documents(documents)
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadata_list,
            )
            print(f"Successfully added {len(documents)} chunks to the database.")

        print(f"Document processing completed in {time.time() - start_time:.2f} seconds")
        return collection
|