luanpoppe committed ec8caf1 · 2 Parent(s): bdf043b 9d69740

Merge branch 'feat-refatoracoes-gerais' of https://github.com/luanpoppe/vella-backend into feat-refatoracoes-gerais

Files changed (1)
  1. _utils/resumo_simples_cursor.py +0 -234
_utils/resumo_simples_cursor.py DELETED
@@ -1,234 +0,0 @@
- import os
- from typing import List, Dict, Tuple
- from setup.easy_imports import (
-     HuggingFaceEmbeddings,
-     PyPDFLoader,
-     Chroma,
-     ChatOpenAI,
-     create_extraction_chain,
-     PromptTemplate,
-     RecursiveCharacterTextSplitter,
- )
- from dataclasses import dataclass
- import uuid
- import json
- from langchain_huggingface import HuggingFaceEndpoint
- from setup.environment import default_model
-
- os.environ["LANGCHAIN_TRACING_V2"] = "true"
- os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
- os.environ.get("LANGCHAIN_API_KEY")
- os.environ["LANGCHAIN_PROJECT"] = "VELLA"
-
-
- @dataclass
- class DocumentChunk:
-     content: str
-     page_number: int
-     chunk_id: str
-     start_char: int
-     end_char: int
-
-
- class DocumentSummarizer:
-
-     def __init__(
-         self, openai_api_key: str, model, embedding, chunk_config, system_prompt
-     ):
-         self.model = model
-         self.system_prompt = system_prompt
-         self.openai_api_key = openai_api_key
-         self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
-         self.text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=chunk_config["size"], chunk_overlap=chunk_config["overlap"]
-         )
-         self.chunk_metadata = {}  # Store chunk metadata for tracing
-
-     def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
-         """Load PDF and split into chunks with metadata"""
-         loader = PyPDFLoader(pdf_path)
-         pages = loader.load()
-         chunks = []
-         char_count = 0
-
-         for page in pages:
-             text = page.page_content
-             # Split the page content
-             page_chunks = self.text_splitter.split_text(text)
-
-             for chunk in page_chunks:
-                 chunk_id = str(uuid.uuid4())
-                 start_char = text.find(chunk)
-                 end_char = start_char + len(chunk)
-
-                 doc_chunk = DocumentChunk(
-                     content=chunk,
-                     page_number=page.metadata.get("page") + 1,  # 1-based page numbering
-                     chunk_id=chunk_id,
-                     start_char=char_count + start_char,
-                     end_char=char_count + end_char,
-                 )
-                 chunks.append(doc_chunk)
-
-                 # Store metadata for later retrieval
-                 self.chunk_metadata[chunk_id] = {
-                     "page": doc_chunk.page_number,
-                     "start_char": doc_chunk.start_char,
-                     "end_char": doc_chunk.end_char,
-                 }
-
-             char_count += len(text)
-
-         return chunks
-
-     def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
-         """Create vector store with metadata"""
-         texts = [chunk.content for chunk in chunks]
-         metadatas = [
-             {
-                 "chunk_id": chunk.chunk_id,
-                 "page": chunk.page_number,
-                 "start_char": chunk.start_char,
-                 "end_char": chunk.end_char,
-             }
-             for chunk in chunks
-         ]
-
-         vector_store = Chroma.from_texts(
-             texts=texts, metadatas=metadatas, embedding=self.embeddings
-         )
-         return vector_store
-
-     def generate_summary_with_sources(
-         self,
-         vector_store: Chroma,
-         query: str = "Summarize the main points of this document",
-     ) -> List[Dict]:
-         """Generate summary with source citations, returning structured JSON data"""
-         # Retrieve relevant chunks with metadata
-         relevant_docs = vector_store.similarity_search_with_score(query, k=5)
-
-         # Prepare context and track sources
-         contexts = []
-         sources = []
-
-         for doc, score in relevant_docs:
-             chunk_id = doc.metadata["chunk_id"]
-             context = doc.page_content
-             contexts.append(context)
-
-             sources.append(
-                 {
-                     "content": context,
-                     "page": doc.metadata["page"],
-                     "chunk_id": chunk_id,
-                     "relevance_score": score,
-                 }
-             )
-
-         prompt = PromptTemplate(
-             template=self.system_prompt, input_variables=["context"]
-         )
-         llm = ""
-
-         if self.model == default_model:
-             llm = ChatOpenAI(
-                 temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
-             )
-         else:
-             llm = HuggingFaceEndpoint(
-                 repo_id=self.model,
-                 task="text-generation",
-                 max_new_tokens=1100,
-                 do_sample=False,
-                 huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
-             )
-
-         response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content
-
-         # Split the response into paragraphs
-         summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
-
-         # Create structured output
-         structured_output = []
-         for idx, summary in enumerate(summaries):
-             # Associate each summary with the most relevant source
-             structured_output.append(
-                 {
-                     "content": summary,
-                     "source": {
-                         "page": sources[min(idx, len(sources) - 1)]["page"],
-                         "text": sources[min(idx, len(sources) - 1)]["content"][:200]
-                         + "...",
-                         "relevance_score": sources[min(idx, len(sources) - 1)][
-                             "relevance_score"
-                         ],
-                     },
-                 }
-             )
-
-         return structured_output
-
-     def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
-         """Get extended context around a specific chunk"""
-         metadata = self.chunk_metadata.get(chunk_id)
-         if not metadata:
-             return None
-
-         return {
-             "page": metadata["page"],
-             "start_char": metadata["start_char"],
-             "end_char": metadata["end_char"],
-         }
-
-
- def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
-     # By Luan
-     allPdfsChunks = []
-
-     # Initialize summarizer
-     summarizer = DocumentSummarizer(
-         openai_api_key=os.environ.get("OPENAI_API_KEY"),
-         embedding=serializer["hf_embedding"],
-         chunk_config={
-             "size": serializer["chunk_size"],
-             "overlap": serializer["chunk_overlap"],
-         },
-         system_prompt=serializer["system_prompt"],
-         model=serializer["model"],
-     )
-
-     # Load and process document
-     for pdf in listaPDFs:
-         pdf_path = pdf
-         chunks = summarizer.load_and_split_document(pdf_path)
-         allPdfsChunks = allPdfsChunks + chunks
-
-     vector_store = summarizer.create_vector_store(allPdfsChunks)
-
-     # Generate structured summary
-     structured_summaries = summarizer.generate_summary_with_sources(vector_store)
-
-     # Print or return the structured data
-     # print(structured_summaries)
-     json_data = json.dumps(structured_summaries)
-     print("\n\n")
-     print(json_data)
-     return structured_summaries
-     # If you need to send to frontend, you can just return structured_summaries
-     # It will be in the format:
-     # [
-     #     {
-     #         "content": "Summary point 1...",
-     #         "source": {
-     #             "page": 1,
-     #             "text": "Source text...",
-     #             "relevance_score": 0.95
-     #         }
-     #     },
-     #     ...
-     # ]
-
-
- if __name__ == "__main__":
-     get_llm_summary_answer_by_cursor()
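
For context, a minimal sketch of how the removed helper was presumably called before deletion; the file's own __main__ guard invokes get_llm_summary_answer_by_cursor() with no arguments, so it could not run standalone. The serializer keys below mirror the ones the function reads; every concrete value is hypothetical:

# Hypothetical caller for the deleted module; all literal values are illustrative only.
from _utils.resumo_simples_cursor import get_llm_summary_answer_by_cursor

serializer = {
    "model": "gpt-4o-mini",  # hypothetical; if equal to default_model, the ChatOpenAI branch is taken
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # hypothetical embedding model name
    "chunk_size": 1000,  # hypothetical splitter settings
    "chunk_overlap": 200,
    # The template must contain {context}: PromptTemplate above declares input_variables=["context"].
    "system_prompt": "Summarize the main points of the following document:\n\n{context}",
}

# Returns the structured list documented in the comments above; the PDF path is hypothetical.
structured_summaries = get_llm_summary_answer_by_cursor(serializer, ["example.pdf"])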