import argparse
import os
import shutil

from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from get_embedding_function import get_embedding_function
from langchain_chroma import Chroma
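
# Expected layout (as implied by the constants below and main()): one subfolder
# per city under DATA_ROOT_PATH (pdf/<city>/*.pdf), mirrored by one persisted
# Chroma database per city under CHROMA_ROOT_PATH (chroma/<city>).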
CHROMA_ROOT_PATH = "chroma"
DATA_ROOT_PATH = "pdf"


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--reset", action="store_true", help="Reset the databases.")
    args = parser.parse_args()

    if args.reset:
        print("Clearing all Chroma databases")
        clear_all_databases()

    # Each subfolder of DATA_ROOT_PATH is treated as one city.
    for city_folder in os.listdir(DATA_ROOT_PATH):
        city_path = os.path.join(DATA_ROOT_PATH, city_folder)
        if os.path.isdir(city_path):
            print(f"🔄 Processing city: {city_folder}")
            process_city(city_folder, city_path)


def process_city(city_name: str, city_path: str):
    """
    Process a city subfolder, creating or updating its corresponding Chroma database.
    """
    chroma_city_path = os.path.join(CHROMA_ROOT_PATH, city_name)
    documents = load_documents(city_path)
    chunks = split_documents(documents)
    add_to_chroma(chunks, chroma_city_path)


def load_documents(city_path: str):
    """
    Load the PDF documents from a city's subfolder.

    PyPDFDirectoryLoader yields one Document per PDF page, with "source" and
    "page" metadata that calculate_chunk_ids() relies on.
    """
    document_loader = PyPDFDirectoryLoader(city_path)
    return document_loader.load()


def split_documents(documents: list[Document]):
    """
    Split the documents into smaller chunks.
    """
    # Chunks are capped at 800 characters (length_function=len) with an
    # 80-character overlap between consecutive chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


def add_to_chroma(chunks: list[Document], chroma_path: str):
    """
    Add or update the documents in the city-specific Chroma database.

    Ingestion is incremental: only chunks whose IDs are not already in the
    database are added, so the script can safely be re-run.
    """
    db = Chroma(
        persist_directory=chroma_path, embedding_function=get_embedding_function()
    )
    chunks_with_ids = calculate_chunk_ids(chunks)

    # include=[] fetches only the IDs, which is all we need for deduplication.
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of documents in database '{chroma_path}': {len(existing_ids)}")

    new_chunks = [chunk for chunk in chunks_with_ids if chunk.metadata["id"] not in existing_ids]
    if new_chunks:
        print(f"👉 Adding {len(new_chunks)} new document(s) to database '{chroma_path}'")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
    else:
        print(f"✅ No new documents to add to database '{chroma_path}'")


def calculate_chunk_ids(chunks):
    """
    Compute a unique ID for each chunk from its source file and page,
    in the form "source:page:chunk_index" (e.g. "pdf/<city>/doc.pdf:6:2").
    """
    last_page_id = None
    current_chunk_index = 0
    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"
        # Consecutive chunks from the same page get increasing indices;
        # a new page resets the counter.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id
        chunk.metadata["id"] = chunk_id
    return chunks


def clear_all_databases():
    """
    Remove all existing Chroma databases.
    """
    if os.path.exists(CHROMA_ROOT_PATH):
        shutil.rmtree(CHROMA_ROOT_PATH)


if __name__ == "__main__":
    main()
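
# Usage sketch (the script's file name isn't shown here, so "populate_database.py"
# is a hypothetical placeholder):
#   python populate_database.py          # index any new PDFs, city by city
#   python populate_database.py --reset  # wipe chroma/ and rebuild from scratch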