from typing import Any, List, Tuple, Union
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from _utils.gerar_relatorio_modelo_usuario.llm_calls import agemini_answer
from _utils.langchain_utils.Splitter_class import Splitter
from _utils.langchain_utils.LLM_class import LLM
from _utils.gerar_relatorio_modelo_usuario.prompts import (
create_prompt_auxiliar_do_contextual_prompt,
)
import re
from gerar_documento.serializer import GerarDocumentoSerializerData


def gerar_resposta_compilada(serializer: Union[GerarDocumentoSerializerData, Any]):
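    """Collect the request's generation settings from the serializer into a plain dict.

    The document-generation prompt is truncated to its first 200 characters.
    """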
return {
"num_chunks_retrieval": serializer.num_chunks_retrieval,
"embedding_weight": serializer.embedding_weight,
"bm25_weight": serializer.bm25_weight,
"context_window": serializer.context_window,
"chunk_overlap": serializer.chunk_overlap,
"num_k_rerank": serializer.num_k_rerank,
"model_cohere_rerank": serializer.model_cohere_rerank,
"more_initial_chunks_for_reranking": serializer.more_initial_chunks_for_reranking,
"claude_context_model": serializer.claude_context_model,
"gpt_temperature": serializer.gpt_temperature,
"user_message": serializer.user_message,
"model": serializer.model,
"hf_embedding": serializer.hf_embedding,
"chunk_size": serializer.chunk_size,
"chunk_overlap": serializer.chunk_overlap,
# "prompt_auxiliar": serializer.prompt_auxiliar,
"prompt_gerar_documento": serializer.prompt_gerar_documento[0:200],
}


def check_regex_patterns(context: str, lista_de_document_ids: List[int]):
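    """Try each known <chunk_context> regex against the LLM output.

    Returns the matches of the first pattern whose match count equals the number of
    expected document ids; otherwise, the matches of the last pattern tried.
    Expected format (roughly): <chunk_context>12 - Pág. 3 --- title --- context</chunk_context>
    """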
patterns = [
# r"\[*([\d.\-]+)\]*\s*---\s*\[*([^]]+)\]*\s*---\s*\[*([^]]+)\]*\s*</chunk_context>", # PRIMEIRO DE TODOS
# r"<chunk_context>\s*([\d.\-]+)\s*---\s*([^<]+)\s*---\s*([^<]+)\s*</chunk_context>",
r"<chunk_context>\s*(\d+)(?:\s*-\s*Pág\.\s*\d+)?\s*---\s*([^-\n]+)\s*---\s*([^<]+)</chunk_context>",
r"<chunk_context>\s*(?:\[*([\d]+)\]*\s*[-–]*\s*(?:Pág\.\s*\d+\s*[-–]*)?)?\s*\[*([^\]]+)\]*\s*[-–]*\s*\[*([^\]]+)\]*\s*[-–]*\s*\[*([^\]]+)\]*\s*</chunk_context>",
# r"\[([\d.\-]+)\]\s*---\s*\[([^]]+)\]\s*---\s*\[([^]]+)\]\s*</chunk_context>",
# r"<chunk_context>\s*\[?([\d.\-]+)\]?\s*---\s*\[?([^\]\[]+?)\]?\s*---\s*\[?([^<]+?)\]?\s*</chunk_context>",
# r"<chunk_context>\s*\[([\d.\-]+)\]\s*---\s*\[([^\]]+)\]\s*---\s*\[([^\]]+)\]\s*</chunk_context>"
# r"<chunk_context>\s*\[?([\d.\-\s]+)\]?\s*---\s*\[?([^\]\[]+?)\]?\s*---\s*\[?([\s\S]+?)\]?\s*</chunk_context>",
]
for pattern in patterns:
matches = re.findall(pattern, context, re.DOTALL)
if len(matches) == len(lista_de_document_ids):
print("\n--------------- REGEX DO CONTEXTUAL FUNCIONOU")
break
return matches


def validate_many_chunks_in_one_request(
response: str, lista_de_document_ids: List[int]
):
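    """Strip "document_id" prefixes, extract the <chunk_context> tuples and pair them with the expected ids.

    Returns at most 20 tuples of (expected document id, second capture group, third capture group),
    or False when nothing matched.
    """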
context = (
response.replace("document_id: ", "")
.replace("document_id:", "")
.replace("DOCUMENT_ID: ", "")
.replace("DOCUMENT_ID: ", "")
)
    # pattern = r"\[(\d+|[-.]+)\] --- (.+?) --- (.+?)</chunk_context>"  # Works when the LLM response does not spell out "document_id"
matches = check_regex_patterns(context, lista_de_document_ids)
matches_as_list = []
for index, match in enumerate(list(matches)):
if index >= 20:
break
        # The id captured by the regex is discarded; the known document id for this position is used instead
        resultado = lista_de_document_ids[index]
matches_as_list.append((resultado, match[1], match[2]))
if len(matches) == 0:
print(
"----------- ERROU NA TENTATIVA ATUAL DE FORMATAR O CONTEXTUAL -----------"
)
return False
return matches_as_list


# This function generates the response that will be used in each of the per-chunk requests
async def get_response_from_auxiliar_contextual_prompt(full_text_as_array: List[str]):
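    """Group the text parts into batches that normally stay under 600,000 tokens and send each batch to Gemini.

    Token counts use tiktoken's cl100k_base encoding; the Gemini responses are concatenated and returned.
    """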
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
llms = LLM()
responses = []
current_chunk = []
current_token_count = 0
chunk_counter = 1
for part in full_text_as_array:
part_tokens = len(encoding.encode(part))
# Check if adding this part would EXCEED the limit
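        # (the 600,000-token cap is presumably a safety margin against the Gemini model's context window)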
if current_token_count + part_tokens > 600000:
# Process the accumulated chunk before it exceeds the limit
chunk_text = "".join(current_chunk)
print(
f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
)
prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
response = await llms.google_gemini().ainvoke(
[HumanMessage(content=prompt)]
)
responses.append(response.content)
# Start new chunk with current part
current_chunk = [part]
current_token_count = part_tokens
chunk_counter += 1
else:
# Safe to add to current chunk
current_chunk.append(part)
current_token_count += part_tokens
# Process the final remaining chunk
if current_chunk:
chunk_text = "".join(current_chunk)
print(
f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
)
prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
response = await llms.google_gemini().ainvoke([HumanMessage(content=prompt)])
responses.append(response.content)
return "".join(responses)


async def get_full_text_and_all_PDFs_chunks(
listaPDFs: List[str],
splitterObject: Splitter,
should_use_llama_parse: bool,
isBubble: bool,
):
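    """Load and split every PDF in listaPDFs, accumulating the chunks of all files.

    Note that `pages` is overwritten on each iteration, so only the last PDF's pages are returned.
    """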
all_PDFs_chunks = []
pages: List[str] = []
# Load and process document
for pdf_path in listaPDFs:
chunks, pages = await splitterObject.load_and_split_document(
pdf_path, should_use_llama_parse, isBubble
)
all_PDFs_chunks = all_PDFs_chunks + chunks
return all_PDFs_chunks, pages


async def generate_document_title(resumo_para_gerar_titulo: str):
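    """Ask Gemini (via agemini_answer) for a title for a legal document, given its summary in Portuguese."""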
prompt = f"Você é um assistente jurídico e irá receber abaixo o resumo de um documento jurídico. Quero que você gere um título para este documento. Mande como resposta apenas o título gerado, nada mais. Aqui está um título de exemplo pra você se basear ao criar um novo: <titulo_de_exemplo>Ação Penal por Furto Qualificado nº 0002269-86.2009.805.0032<titulo_de_exemplo>\n\nSegue abaixo o resumo do documento jurídico:\n{resumo_para_gerar_titulo}"
response = await agemini_answer(prompt, "gemini-2.0-flash-lite")
return response
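

# A minimal usage sketch (hypothetical wiring; assumes a Splitter instance can be built for your setup
# and that the code runs inside an async context):
#
#   chunks, pages = await get_full_text_and_all_PDFs_chunks(
#       listaPDFs=["processo.pdf"], splitterObject=splitter, should_use_llama_parse=False, isBubble=False
#   )
#   resumo = await get_response_from_auxiliar_contextual_prompt(pages)
#   titulo = await generate_document_title(resumo)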