Update utils.py
Browse files
utils.py
CHANGED
|
@@ -267,34 +267,6 @@ def load_word_with_metadata(file_path):
|
|
| 267 |
return documents
|
| 268 |
|
| 269 |
|
| 270 |
-
"""
|
| 271 |
-
# Custom loader functions
|
| 272 |
-
def load_pdf_with_metadata(file_path):
|
| 273 |
-
document = fitz.open(file_path)
|
| 274 |
-
documents = []
|
| 275 |
-
for page_num in range(len(document)):
|
| 276 |
-
page = document.load_page(page_num)
|
| 277 |
-
content = page.get_text("text")
|
| 278 |
-
metadata = {
|
| 279 |
-
"title": document.metadata.get("title", "Unbekannt"),
|
| 280 |
-
"page": page_num + 1,
|
| 281 |
-
"path": file_path
|
| 282 |
-
}
|
| 283 |
-
documents.append({"page_content": content, "metadata": metadata})
|
| 284 |
-
return documents
|
| 285 |
-
|
| 286 |
-
def load_word_with_metadata(file_path):
|
| 287 |
-
document = docx.Document(file_path)
|
| 288 |
-
metadata = {
|
| 289 |
-
"title": "Dokument",
|
| 290 |
-
"path": file_path
|
| 291 |
-
}
|
| 292 |
-
contents = []
|
| 293 |
-
for para in document.paragraphs:
|
| 294 |
-
content = para.text
|
| 295 |
-
contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
|
| 296 |
-
return contents
|
| 297 |
-
"""
|
| 298 |
|
| 299 |
|
| 300 |
################################################
|
|
@@ -362,34 +334,7 @@ def document_retrieval_chroma(llm, prompt):
|
|
| 362 |
return db
|
| 363 |
|
| 364 |
|
| 365 |
-
############################################
|
| 366 |
-
# rag_chain Alternative für RAg mit Bild-Upload, da hier das llm so nicht genutzt werden kann und der prompt mit den RAG Erweiterungen anders übergeben wird
|
| 367 |
-
#langchain nutzen, um prompt an llm zu leiten, aber vorher in der VektorDB suchen, um passende splits zum Prompt hinzuzufügen
|
| 368 |
-
#prompt mit RAG!!!
|
| 369 |
-
"""
|
| 370 |
-
def rag_chainback(prompt, db, k=3):
|
| 371 |
-
rag_template = "Nutze ausschließlich die folgenden Kontext Teile am Ende, um die Frage zu beantworten . " + template + "Frage: " + prompt + "Kontext Teile: "
|
| 372 |
-
retrieved_chunks = db.similarity_search(prompt, k)
|
| 373 |
-
|
| 374 |
-
# Erstelle ein Dictionary für die Chunks
|
| 375 |
-
chunks_dict = []
|
| 376 |
-
for i, chunk in enumerate(retrieved_chunks):
|
| 377 |
-
chunk_dict = {
|
| 378 |
-
"chunk_index": i + 1,
|
| 379 |
-
"page_content": chunk.page_content, # assuming chunk has page_content attribute
|
| 380 |
-
"metadata": chunk.metadata # assuming chunk has metadata attribute
|
| 381 |
-
}
|
| 382 |
-
chunks_dict.append(chunk_dict)
|
| 383 |
-
|
| 384 |
-
# Erstelle das neue Prompt
|
| 385 |
-
neu_prompt = rag_template
|
| 386 |
-
for chunk in chunks_dict:
|
| 387 |
-
neu_prompt += f"{chunk['chunk_index']}. {chunk['page_content']}\n"
|
| 388 |
|
| 389 |
-
print("dict.............................."+ json.dumps(chunks_dict, indent=4, ensure_ascii=False))
|
| 390 |
-
|
| 391 |
-
return neu_prompt, chunks_dict # returning both the new prompt and the dictionary
|
| 392 |
-
"""
|
| 393 |
|
| 394 |
###############################################
|
| 395 |
#Langchain anlegen für RAG Chaining
|
|
@@ -414,10 +359,7 @@ def rag_chain(llm, prompt, retriever):
|
|
| 414 |
relevant_docs = retriever.get_relevant_documents(prompt)
|
| 415 |
extracted_docs = extract_document_info(relevant_docs)
|
| 416 |
|
| 417 |
-
|
| 418 |
-
print("releant docs1......................")
|
| 419 |
if (len(extracted_docs)>0):
|
| 420 |
-
print("releant docs2......................")
|
| 421 |
print(extracted_docs)
|
| 422 |
#llm_chain = LLMChain(llm = llm, prompt = RAG_CHAIN_PROMPT)
|
| 423 |
#result = llm_chain.run({"context": relevant_docs, "question": prompt})
|
|
@@ -476,23 +418,6 @@ def extract_document_info(documents):
|
|
| 476 |
}
|
| 477 |
extracted_info.append(info)
|
| 478 |
return extracted_info
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
"""
|
| 482 |
-
# Funktion zum Erstellen der Liste von Dictionaries
|
| 483 |
-
def extract_document_info(documents):
|
| 484 |
-
extracted_info = []
|
| 485 |
-
for doc in documents:
|
| 486 |
-
info = {
|
| 487 |
-
'content' : doc["content"],
|
| 488 |
-
'metadaten' : doc["metadata"],
|
| 489 |
-
'titel' : metadaten.get("title", "Keine Überschrift"),
|
| 490 |
-
'seite' : metadaten.get("page", "Unbekannte Seite"),
|
| 491 |
-
'pfad' : metadaten.get("path", "Kein Pfad verfügbar")
|
| 492 |
-
}
|
| 493 |
-
extracted_info.append(info)
|
| 494 |
-
return extracted_info
|
| 495 |
-
"""
|
| 496 |
|
| 497 |
|
| 498 |
|
|
|
|
| 267 |
return documents
|
| 268 |
|
| 269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
|
| 272 |
################################################
|
|
|
|
| 334 |
return db
|
| 335 |
|
| 336 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
###############################################
|
| 340 |
#Langchain anlegen für RAG Chaining
|
|
|
|
| 359 |
relevant_docs = retriever.get_relevant_documents(prompt)
|
| 360 |
extracted_docs = extract_document_info(relevant_docs)
|
| 361 |
|
|
|
|
|
|
|
| 362 |
if (len(extracted_docs)>0):
|
|
|
|
| 363 |
print(extracted_docs)
|
| 364 |
#llm_chain = LLMChain(llm = llm, prompt = RAG_CHAIN_PROMPT)
|
| 365 |
#result = llm_chain.run({"context": relevant_docs, "question": prompt})
|
|
|
|
| 418 |
}
|
| 419 |
extracted_info.append(info)
|
| 420 |
return extracted_info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
|
| 423 |
|