luanpoppe committed · Commit 3143cff · Parent(s): ca8a144

feat: add lookup of a user's model so it can be injected into the final system prompt
Files changed:
- _utils/resumo_completo_cursor.py +29 -16
- resumos/serializer.py +4 -6
- setup/environment.py +3 -1
_utils/resumo_completo_cursor.py
CHANGED
@@ -16,6 +16,9 @@ import numpy as np
 from rank_bm25 import BM25Okapi
 import logging
 from cohere import Client
+import requests
+from setup.environment import api_url
+from rest_framework.response import Response
 
 def reciprocal_rank_fusion(result_lists, weights=None):
     """Combine multiple ranked lists using reciprocal rank fusion"""
@@ -85,21 +88,20 @@ class DocumentSummarizer:
     def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
         """Load PDF and split into chunks with metadata"""
         loader = PyPDFLoader(pdf_path)
-        pages = loader.load()
+        pages = loader.load()  # Returns a list of Document objects, each item corresponding to ONE full page of the PDF.
         chunks = []
         char_count = 0
 
         for page in pages:
             text = page.page_content
-            #
-            page_chunks = self.text_splitter.split_text(text)
+            page_chunks = self.text_splitter.split_text(text)  # Breaks the single-page Document into a list of chunks, each one a piece smaller than a page.
 
             for chunk in page_chunks:
                 chunk_id = str(uuid.uuid4())
-                start_char = text.find(chunk)
+                start_char = text.find(chunk)  # Position of the chunk within the full page.
                 end_char = start_char + len(chunk)
 
-                doc_chunk = DocumentChunk(
+                doc_chunk = DocumentChunk(  # Builds the chunk object with additional information, such as the chunk's position and id.
                     content=chunk,
                     page_number=page.metadata.get('page') + 1,  # 1-based page numbering
                     chunk_id=chunk_id,
@@ -119,7 +121,7 @@ class DocumentSummarizer:
 
         return chunks
 
-    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
+    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:  # This function is never used.
         """Create vector store with metadata"""
         texts = [chunk.content for chunk in chunks]
         metadatas = [{
@@ -136,7 +138,7 @@ class DocumentSummarizer:
         )
         return vector_store
 
-    def rerank_chunks(
+    def rerank_chunks(  # This function is never used.
         self,
         chunks: List[Dict],
         query: str,
@@ -180,7 +182,7 @@ class DocumentSummarizer:
             logging.error(f"Reranking failed: {str(e)}")
             return chunks[:k]  # Fallback to original ordering
 
-    def generate_summary_with_sources(
+    def generate_summary_with_sources(  # This function is never used.
         self,
         vector_store: Chroma,
         query: str = "Summarize the main points of this document"
@@ -256,7 +258,7 @@ class DocumentSummarizer:
 
         return structured_output
 
-    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
+    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:  # This function is never used.
         """Get extended context around a specific chunk"""
         metadata = self.chunk_metadata.get(chunk_id)
         if not metadata:
@@ -270,7 +272,7 @@
 
 class ContextualRetriever:
     def __init__(self, config: RetrievalConfig, claude_api_key: str, claude_context_model):
-        self.config = config
+        self.config = config  # Currently not used anywhere in this class; review whether it should be.
         self.claude_client = Anthropic(api_key=claude_api_key)
         self.logger = logging.getLogger(__name__)
         self.bm25 = None
@@ -293,12 +295,12 @@ class ContextualRetriever:
                 max_tokens=100,
                 messages=[{"role": "user", "content": prompt}]
             )
-            return response.content[0].text
+            return response.content[0].text  # response.content is a list of content blocks; the first item carries the model's reply text.
         except Exception as e:
             self.logger.error(f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}")
             return ""
 
-    def contextualize_chunks(self, full_text: str, chunks: List[DocumentChunk]) -> List[ContextualizedChunk]:
+    def contextualize_chunks(self, full_text: str, chunks: List[DocumentChunk]) -> List[ContextualizedChunk]:  # Takes each chunk and adds a context property to it: the output of the function above, which calls a Claude model to describe the chunk's context.
         """Add context to all chunks"""
         contextualized_chunks = []
         for chunk in chunks:
@@ -315,7 +317,7 @@
         return contextualized_chunks
 
 class EnhancedDocumentSummarizer(DocumentSummarizer):
-    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, system_prompt, gpt_model, gpt_temperature):
+    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, system_prompt, gpt_model, gpt_temperature, id_modelo_do_usuario):
         super().__init__(openai_api_key, os.environ.get("COHERE_API_KEY"), embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank)
         self.config = config
         self.contextual_retriever = ContextualRetriever(config, claude_api_key, claude_context_model)
@@ -323,6 +325,7 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         self.system_prompt = system_prompt
         self.gpt_model = gpt_model
         self.gpt_temperature = gpt_temperature
+        self.id_modelo_do_usuario = id_modelo_do_usuario
 
     def create_enhanced_vector_store(self, chunks: List[ContextualizedChunk]) -> Tuple[Chroma, BM25Okapi, List[str]]:
         """Create vector store and BM25 index with contextualized chunks"""
@@ -453,18 +456,27 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
 
         prompt_template = self.system_prompt
 
+        url_request = f"{api_url}/modelo/{self.id_modelo_do_usuario}"
+        resposta = requests.get(url_request)
+
+        if (resposta.status_code != 200):
+            return Response({"error": "Something went wrong. The model may not have been found. Please try again and/or contact the technical team"})
+
+        modelo_buscado = resposta.json()["modelo"]
+
         prompt = PromptTemplate(
             template=prompt_template,
-            input_variables=["context"]
+            input_variables=["context", "modelo_usuario"]
        )
 
         llm = ChatOpenAI(
             temperature=self.gpt_temperature,
             model_name=self.gpt_model,
             api_key=self.openai_api_key,
+
         )
 
-        response = llm.predict(prompt.format(context="\n\n".join(contexts)))
+        response = llm.predict(prompt.format(context="\n\n".join(contexts), modelo_usuario=modelo_buscado))
 
         # Split the response into paragraphs
         summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
@@ -515,7 +527,8 @@ def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs):
         claude_context_model=serializer["claude_context_model"],
         system_prompt=serializer["system_prompt"],
         gpt_model=serializer["model"],
-        gpt_temperature=serializer["gpt_temperature"]
+        gpt_temperature=serializer["gpt_temperature"],
+        id_modelo_do_usuario=serializer["id_modelo_do_usuario"]
     )
 
     # # Load and process document
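Taken together, these hunks wire a remote lookup into the summarization step: the summarizer now receives id_modelo_do_usuario, fetches the user's model from the backend at {api_url}/modelo/{id}, and feeds the result into the prompt through the new modelo_usuario input variable. Below is a minimal standalone sketch of that flow, assuming, as the diff implies, that the endpoint returns JSON shaped like {"modelo": ...}; fetch_user_model is a hypothetical helper name, since the committed code performs the request inline.

import requests
from langchain.prompts import PromptTemplate

api_url = "https://luanpoppe-vella-backend.hf.space"  # value from setup/environment.py

def fetch_user_model(id_modelo_do_usuario: int) -> str:
    # Hypothetical helper mirroring the inline lookup added by this commit.
    resposta = requests.get(f"{api_url}/modelo/{id_modelo_do_usuario}")
    resposta.raise_for_status()  # the commit instead returns Response({"error": ...}) on non-200
    return resposta.json()["modelo"]

# The template now declares both variables, matching the updated serializer default.
prompt = PromptTemplate(
    template="Context: {context}\n\nUser's model: {modelo_usuario}\n\nKey points:",
    input_variables=["context", "modelo_usuario"],
)

final_prompt = prompt.format(
    context="...retrieved chunks...",
    modelo_usuario=fetch_user_model(1),  # illustrative id
)

One design note: returning a rest_framework Response from inside EnhancedDocumentSummarizer couples the summarizer to the view layer; raising an exception and letting the view build the error response would keep the class framework-agnostic.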
resumos/serializer.py
CHANGED
@@ -37,17 +37,14 @@ system_prompt = """
 
 Context: {context}
 
+User's model: {modelo_usuario}
+
 Key points:
 """
 user_message = "What are the main points of this document?"
 class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
-    # files = serializers.ListField(child=serializers.FileField(), required=True)
     system_prompt = serializers.CharField(required=False, default=system_prompt)
     user_message = serializers.CharField(required=False, default=user_message)
-    # model = serializers.CharField(required=False, default=default_model)
-    # hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
-    # chunk_size = serializers.IntegerField(required=False, default=1000)
-    # chunk_overlap = serializers.IntegerField(required=False, default=200)
     num_chunks_retrieval = serializers.IntegerField(default=5)
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
@@ -57,4 +54,5 @@ class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
     model_cohere_rerank = serializers.CharField(required=False, default="rerank-english-v2.0")
     more_initial_chunks_for_reranking = serializers.IntegerField(default=20)
     claude_context_model = serializers.CharField(required=False, default="claude-3-haiku-20240307")
-    gpt_temperature = serializers.FloatField(default=0)
+    gpt_temperature = serializers.FloatField(default=0)
+    id_modelo_do_usuario = serializers.IntegerField(required=True)
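Because id_modelo_do_usuario is declared with required=True, existing clients must start sending it or validation will fail. A minimal sketch of the new contract, assuming a configured Django environment and that the fields inherited from ResumoCursorSerializer are covered by defaults (payload values are illustrative):

from resumos.serializer import ResumoCursorCompeltoSerializer

payload = {
    "gpt_temperature": 0,        # optional, defaults to 0
    "id_modelo_do_usuario": 1,   # now required; omitting it fails validation
}

serializer = ResumoCursorCompeltoSerializer(data=payload)
print(serializer.is_valid())     # False if id_modelo_do_usuario is missing
print(serializer.errors)         # e.g. {'id_modelo_do_usuario': ['This field is required.']}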
setup/environment.py
CHANGED
@@ -1,2 +1,4 @@
 default_model = "gpt-4o-mini"
-# default_model = "gpt-4o"
+# default_model = "gpt-4o"
+
+api_url = "https://luanpoppe-vella-backend.hf.space"
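This hard-coded api_url is the constant imported at the top of _utils/resumo_completo_cursor.py. A one-line sketch of the URL the new lookup builds from it (id 1 is illustrative):

from setup.environment import api_url

url_request = f"{api_url}/modelo/1"
# -> https://luanpoppe-vella-backend.hf.space/modelo/1

Reading the value from an environment variable instead would make it easier to point the lookup at a staging or local backend.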