luanpoppe committed
Commit 5fde427 · 2 parents: f9a1a18, e07dc1f

Merge branch 'tests' of https://github.com/luanpoppe/vella-backend into feat-refatoracoes-gerais

.env.example CHANGED
@@ -11,4 +11,6 @@ LLAMA_CLOUD_API_KEY_PEIXE=""
  DEEPSEEKK_API_KEY=""
  GOOGLE_API_KEY_PEIXE=""
  SENTRY_DSN=""
- AMBIENTE="testes"
+ AMBIENTE="testes"
+ GOOGLE_APPLICATION_CREDENTIALS="" # Only needed in a development environment that is not using Docker
+ GCP_CREDENTIALS_JSON_CONTENT="Conteúdo inteiro do arquivo vella_gcp_luan_credentials.json" # In production, this must contain the entire contents of the credentials file. Locally, it does not need to exist
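
For reference, a minimal sketch of how these two variables are typically consumed on the Python side (assumptions: the app relies on Application Default Credentials via the google-auth library, and resolve_gcp_credentials is a hypothetical helper not present in this diff):

import os

import google.auth
from google.auth.exceptions import DefaultCredentialsError


def resolve_gcp_credentials():
    """Hypothetical helper: resolve Application Default Credentials for the GCP clients."""
    # Locally (outside Docker), GOOGLE_APPLICATION_CREDENTIALS points at a key file on disk;
    # in the container, entrypoint.sh writes GCP_CREDENTIALS_JSON_CONTENT to a file and
    # exports the same variable before the app starts, so both paths converge here.
    key_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    try:
        credentials, project_id = google.auth.default()
        return credentials, project_id
    except DefaultCredentialsError as exc:
        raise RuntimeError(
            f"No GCP credentials found (GOOGLE_APPLICATION_CREDENTIALS={key_path!r})"
        ) from exc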
Dockerfile CHANGED
@@ -3,6 +3,10 @@ FROM python:3.12
  # Installation required to convert .doc files
  RUN apt-get update && apt-get install -y antiword

+ # Copy the entrypoint script and make it executable
+ COPY entrypoint.sh /entrypoint.sh
+ RUN chmod +x /entrypoint.sh
+
  RUN useradd -m -u 1000 user
  USER user
  ENV PATH="/home/user/.local/bin:$PATH"
@@ -23,6 +27,10 @@ RUN pip install --no-cache-dir -r requirements.txt
  RUN python manage.py collectstatic --noinput

  RUN pip install uvicorn
+
+ # Set the entrypoint to our script
+ ENTRYPOINT ["/entrypoint.sh"]
+
  CMD ["uvicorn", "setup.asgi:application", "--host", "0.0.0.0", "--port", "7860"]

  # ENTRYPOINT ["python", "manage.py", "runserver"]
_utils/gerar_documento_utils/GerarDocumento.py CHANGED
@@ -13,7 +13,7 @@ from _utils.gerar_documento_utils.prompts import (
      prompt_para_gerar_titulo,
  )
  from _utils.langchain_utils.Chain_class import Chain
- from _utils.langchain_utils.LLM_class import LLM
+ from _utils.langchain_utils.LLM_class import LLM, Google_llms
  from _utils.langchain_utils.Prompt_class import Prompt
  from _utils.langchain_utils.Vector_store_class import VectorStore
  from _utils.utils import convert_markdown_to_HTML
_utils/gerar_documento_utils/prompts.py CHANGED
@@ -1,4 +1,14 @@
- def create_prompt_auxiliar_do_contextual_prompt(PROCESSO_JURIDICO: str):
+ def create_prompt_auxiliar_do_contextual_prompt(PROCESSO_JURIDICO: str | None = None):
+     if PROCESSO_JURIDICO:
+         adicionar_ao_prompt = f"""
+ 1. **Análise Completa:** Leia e analise todo o conteúdo do processo fornecido.
+ <processo_juridico>
+ {PROCESSO_JURIDICO}
+ </processo_juridico>"""
+     else:
+         adicionar_ao_prompt = """
+ 1. **Análise Completa:** Leia e analise todo o conteúdo do processo fornecido como PDF."""
+
      return f"""
  <prompt>
  <persona>
@@ -46,10 +56,7 @@ Seu objetivo é analisar o processo jurídico fornecido e gerar um relatório co
  <instrucoes>
  Siga estritamente os passos abaixo:

- 1. **Análise Completa:** Leia e analise todo o conteúdo do processo fornecido.
- <processo_juridico>
- {PROCESSO_JURIDICO}
- </processo_juridico>
+ {adicionar_ao_prompt}

  2. **Identificação e Listagem de Peças:** Identifique quais das peças listadas na `<tarefa>` estão presentes no texto. Liste **apenas** as encontradas na tag `<pecas_identificadas>`.

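A short usage sketch of the updated helper in both branches (assumption: it is called directly; only the signature shown in this diff is relied on):

from _utils.gerar_documento_utils.prompts import (
    create_prompt_auxiliar_do_contextual_prompt,
)

# With the process text inline, the <processo_juridico> block is embedded in the prompt.
prompt_com_texto = create_prompt_auxiliar_do_contextual_prompt("Texto integral do processo...")

# Without it, the prompt instead tells the model to read the attached PDF,
# which pairs with the new google_gemini_vertex_ainvoke path further below.
prompt_sem_texto = create_prompt_auxiliar_do_contextual_prompt()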
_utils/google_integration/google_cloud.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ from google.cloud import storage
+
+ GCP_PROJECT = "gen-lang-client-0350149082"
+ GCP_REGION = "us-central1"
+ DOCUMENT_API_ID = "b34a20d22dee16bb"
+ GCS_BUCKET_NAME = "vella-pdfs"
+
+
+ def upload_to_gcs(LOCAL_PDF_PATH: str) -> str:
+     """Uploads a file to a GCS bucket and returns its URI."""
+
+     # Path in GCS
+     GCS_DESTINATION_BLOB_NAME = "gemini_uploads/" + os.path.basename(LOCAL_PDF_PATH)
+
+     storage_client = storage.Client(
+         project=GCP_PROJECT,
+     )
+     bucket = storage_client.bucket(GCS_BUCKET_NAME)
+     blob = bucket.blob(GCS_DESTINATION_BLOB_NAME)
+
+     print(
+         f"Uploading {LOCAL_PDF_PATH} to gs://{GCS_BUCKET_NAME}/{GCS_DESTINATION_BLOB_NAME}..."
+     )
+     blob.upload_from_filename(LOCAL_PDF_PATH)
+     gcs_uri = f"gs://{GCS_BUCKET_NAME}/{GCS_DESTINATION_BLOB_NAME}"
+     print(f"File uploaded to {gcs_uri}")
+     return gcs_uri
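
A minimal usage sketch of the new helper (assumptions: the caller already has a local PDF path, shown here as a hypothetical /tmp path, and Application Default Credentials are configured as described in .env.example):

from _utils.google_integration.google_cloud import upload_to_gcs

# Upload a local PDF and keep the returned gs:// URI for the Vertex AI / Document AI calls.
gcs_uri = upload_to_gcs("/tmp/processo_exemplo.pdf")  # hypothetical local path
assert gcs_uri.startswith("gs://vella-pdfs/gemini_uploads/")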
_utils/langchain_utils/LLM_class.py CHANGED
@@ -1,9 +1,10 @@
- from typing import Literal, cast
+ from typing import List, Literal, cast
  from pydantic import SecretStr
- from setup.environment import default_model
+ from _utils.google_integration.google_cloud import GCP_PROJECT, upload_to_gcs
  from setup.easy_imports import ChatOpenAI, ChatGoogleGenerativeAI
  import os
  from langchain_core.messages import HumanMessage
+ from langchain_google_vertexai import ChatVertexAI

  deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
  google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))
@@ -75,3 +76,44 @@ class LLM:
              raise Exception(
                  "Failed to generate the final document after 5 retries and the fallback attempt with chat-gpt-4o-mini."
              ) from e
+
+     async def google_gemini_vertex_ainvoke(
+         self,
+         prompt: str,
+         list_of_pdfs: List[str],
+         model: Google_llms = "gemini-2.5-flash-preview-04-17",
+         max_retries: int = 3,
+     ) -> str | None:
+         message_parts = [
+             {"type": "text", "text": prompt},
+         ]
+         for pdf in list_of_pdfs:
+             pdf_gcs_uri = upload_to_gcs(pdf)
+             message_parts.append(
+                 {
+                     # This structure is used for file references via GCS URI
+                     "type": "media",
+                     "mime_type": "application/pdf",
+                     "file_uri": pdf_gcs_uri,
+                 }
+             )
+
+         for attempt in range(max_retries):
+             try:
+                 llm = ChatVertexAI(
+                     model_name=model,
+                     project=GCP_PROJECT,
+                     location="us-central1",
+                     temperature=0,
+                 )
+                 response = await llm.ainvoke(
+                     [HumanMessage(content=message_parts)]  # type: ignore
+                 )
+
+                 if isinstance(response.content, list):
+                     response.content = "\n".join(response.content)  # type: ignore
+
+                 return response.content  # type: ignore
+             except Exception as e:
+                 model = "gemini-2.0-flash"  # fall back to a more stable model on the next attempt
+                 print(f"Attempt {attempt + 1} failed with error: {e}")
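
A hedged usage sketch of the new Vertex AI path (assumptions: an LLM() instance can be constructed with no arguments, credentials and GCS access are already configured, and the PDF path is a hypothetical example; only the method signature shown above is relied on):

import asyncio

from _utils.langchain_utils.LLM_class import LLM


async def main():
    llm = LLM()  # assumption: default constructor
    resposta = await llm.google_gemini_vertex_ainvoke(
        prompt="Resuma o processo em anexo.",
        list_of_pdfs=["/tmp/processo_exemplo.pdf"],  # hypothetical local path
    )
    # Returns the model text, or None if every attempt (including the
    # gemini-2.0-flash fallback) fails.
    print(resposta)


asyncio.run(main())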
_utils/langchain_utils/Splitter_class.py CHANGED
@@ -1,3 +1,5 @@
+ import os
+ import time
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
  from _utils.handle_files import return_document_list_with_llama_parser
  from _utils.langchain_utils.splitter_util import (
@@ -18,6 +20,16 @@ from _utils.models.gerar_documento import (
      DocumentChunk,
  )
  import uuid
+ import json
+ from _utils.google_integration.google_cloud import (
+     DOCUMENT_API_ID,
+     GCP_PROJECT,
+     GCP_REGION,
+     GCS_BUCKET_NAME,
+     upload_to_gcs,
+ )
+ from google.cloud import documentai
+ from google.cloud import storage


  class Splitter:
@@ -34,7 +46,10 @@ class Splitter:
          self.chunk_metadata = {}  # Store chunk metadata for tracing

      async def load_and_split_document(
-         self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
+         self,
+         pdf_path: str,
+         should_use_llama_parse: bool,
+         isBubble: bool,
      ):
          """Load PDF and split into chunks with metadata"""
          # loader = PyPDFLoader(pdf_path)
@@ -144,6 +159,11 @@ class Splitter:
          # char_count += len(text)
          print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")

+         if len(pages) == 0 or len(chunks) == 0:
+             text = await self.getOCRFromGoogleDocumentAPI(pdf_path)
+             chunks = self.load_and_split_text(text)  # type: ignore
+             chunks_of_string_only = [chunk.content for chunk in chunks]
+
          return chunks, chunks_of_string_only

      def load_and_split_text(self, text: str) -> List[DocumentChunk]:
@@ -185,3 +205,132 @@ class Splitter:
              char_count += len(text)

          return chunks
+
+     async def getOCRFromGoogleDocumentAPI(self, pdf_path: str):
+
+         pdf_gcs_uri = upload_to_gcs(pdf_path)
+
+         GCS_OUTPUT_PREFIX = "documentai_output/"
+         # GCS_INPUT_URI = f"gs://{GCS_BUCKET_NAME}/{f"gemini_uploads/{pdf_gcs_uri}"}"
+         GCS_INPUT_URI = pdf_gcs_uri
+         GCS_OUTPUT_URI = f"gs://{GCS_BUCKET_NAME}/{GCS_OUTPUT_PREFIX}"
+
+         docai_client = documentai.DocumentProcessorServiceClient()
+
+         processor_name = docai_client.processor_path(
+             project=GCP_PROJECT, location="us", processor=DOCUMENT_API_ID
+         )
+
+         gcs_document = documentai.GcsDocument(
+             gcs_uri=GCS_INPUT_URI,
+             mime_type="application/pdf",  # Mime type is specified here for GcsDocument
+         )
+
+         gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
+
+         # 3. Create the BatchDocumentsInputConfig
+         input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
+         # Note: If GCS_INPUT_URI was a prefix for multiple files, you'd use GcsPrefix:
+         # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=GCS_INPUT_URI_PREFIX)
+         # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix, mime_type="application/pdf")
+
+         # 4. Create the DocumentOutputConfig
+         # GCS_OUTPUT_URI should be a gs:// URI prefix where the output JSONs will be stored
+         output_config = documentai.DocumentOutputConfig(
+             gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
+                 gcs_uri=GCS_OUTPUT_URI
+             )
+         )
+
+         # 5. Construct the BatchProcessRequest
+         request = documentai.BatchProcessRequest(
+             name=processor_name,
+             input_documents=input_config,  # Use 'input_documents'
+             document_output_config=output_config,  # Use 'document_output_config'
+         )
+
+         # Submit the batch process request (this is a long-running operation)
+         operation = docai_client.batch_process_documents(request)
+
+         print("Batch processing operation started. Waiting for completion...")
+         while not operation.done():
+             time.sleep(15)  # Wait 15 seconds before checking again
+             print("Waiting...")
+
+         print("Batch processing operation finished.")
+
+         # --- Download the results from GCS ---
+         storage_client = storage.Client(
+             project=GCP_PROJECT
+         )  # Uses GOOGLE_APPLICATION_CREDENTIALS/ADC
+         bucket = storage_client.bucket(GCS_BUCKET_NAME)
+
+         output_blobs = storage_client.list_blobs(
+             GCS_BUCKET_NAME, prefix=GCS_OUTPUT_PREFIX
+         )
+
+         downloaded_files_texts = []
+         try:
+             for blob in output_blobs:
+                 # Document AI adds suffixes and subdirectories. Look for the actual JSON output files.
+                 # The exact naming depends on the processor and options. Common pattern is ending with .json
+                 if blob.name.endswith(".json"):
+                     local_download_path = os.path.basename(
+                         blob.name
+                     )  # Download to current directory with blob name
+                     print(f"Downloading {blob.name} to {local_download_path}...")
+                     blob.download_to_filename(local_download_path)
+
+                     with open(local_download_path, "r", encoding="utf-8") as f:
+                         document_data = json.load(f)
+
+                     # The top-level 'text' field contains the concatenated plain text.
+                     if "text" in document_data and document_data["text"] is not None:
+                         raw_text = document_data["text"]
+                         print(f"\n--- Raw Text Extracted from {blob.name} ---")
+                         # Print only a snippet or process as needed
+                         print(
+                             raw_text[:1000] + "..."
+                             if len(raw_text) > 1000
+                             else raw_text
+                         )
+                         print("--------------------------------------------")
+
+                         return raw_text
+
+                         # Optional: Store the text. If you processed a batch of files,
+                         # you might want to associate the text with the original file name.
+                         # Document AI metadata might link output JSONs back to input files.
+                         # For simplicity here, let's just show the extraction.
+                         # If you know it was a single input PDF, this is all the text.
+                         # If it was multiple, you'd need a mapping or process each JSON.
+
+                     else:
+                         print(
+                             f"Warning: 'text' field not found in {blob.name} or is empty."
+                         )
+
+                     # Optional: Read and print a snippet of the JSON content
+                     # with open(local_download_path, 'r', encoding='utf-8') as f:
+                     #     data = json.load(f)
+                     #     # Print some extracted text, for example (structure varies by processor)
+                     #     if 'text' in data:
+                     #         print(f"Extracted text snippet: {data['text'][:500]}...")  # Print first 500 chars
+                     #     elif 'entities' in data:
+                     #         print(f"Number of entities found: {len(data['entities'])}")
+                     #     else:
+                     #         print("Output JSON structure not immediately recognizable.")
+                     # break  # Uncomment if you only expect/need to process the first output file
+
+             if not downloaded_files_texts:
+                 print("No JSON output files found in the specified output location.")
+
+         except Exception as e:
+             print(f"Error listing or downloading output files: {e}")
+
+         print("\nProcess complete.")
+         if downloaded_files_texts:
+             print(f"Downloaded output file(s): {', '.join(downloaded_files_texts)}")
+             print("These files contain the OCR results in JSON format.")
+         else:
+             print("No output files were successfully downloaded.")
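
One design note on the polling loop above: batch_process_documents returns a long-running operation object, so the manual while not operation.done() / time.sleep loop could instead use the operation's own blocking wait. A minimal sketch, assuming docai_client and request as defined above (the timeout value is illustrative):

# Same call as above; result() blocks until the batch finishes and raises
# (instead of looping indefinitely) if the operation fails or times out.
operation = docai_client.batch_process_documents(request)
operation.result(timeout=600)  # seconds; pick a limit suited to the PDF size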
_utils/langchain_utils/Vector_store_class.py CHANGED
@@ -22,6 +22,8 @@ class VectorStore:
          axiom_instance: Axiom,
      ) -> Tuple[Chroma, BM25Okapi, List[str]]:
          """Create vector store and BM25 index with contextualized chunks"""
+         contador_erro = 0
+
          try:
              # Prepare texts with context
              if is_contextualized_chunk:
@@ -69,5 +71,9 @@ class VectorStore:
              return vector_store, bm25, chunk_ids

          except Exception as e:
+             contador_erro += 1
+             if contador_erro >= 2:
+                 raise Exception(f"Error creating enhanced vector store: {str(e)}")
+
              self.logger.error(f"Error creating enhanced vector store: {str(e)}")
-             raise Exception(f"Error creating enhanced vector store: {str(e)}")
+             return self.create_enhanced_vector_store(chunks, False, axiom_instance)
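
Worth noting for the retry logic above: contador_erro is re-initialized to 0 on every call and the recursive call starts a fresh frame, so the >= 2 guard never trips as written. A hedged sketch of one way to bound the retries, assuming the method could take an extra defaulted parameter (the tentativa argument is hypothetical, not part of this diff):

def create_enhanced_vector_store(self, chunks, is_contextualized_chunk, axiom_instance, tentativa=1):
    try:
        ...  # unchanged body from the diff above
    except Exception as e:
        if tentativa >= 2:
            raise Exception(f"Error creating enhanced vector store: {str(e)}")
        self.logger.error(f"Error creating enhanced vector store: {str(e)}")
        # Retry once without contextualized chunks, carrying the attempt count forward.
        return self.create_enhanced_vector_store(chunks, False, axiom_instance, tentativa + 1)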
entrypoint.sh ADDED
@@ -0,0 +1,26 @@
+ #!/bin/sh
+ set -e  # Exit immediately if a command exits with a non-zero status.
+
+ CREDENTIALS_FILE_PATH="/app/vella_gcp_luan_credentials.json"
+
+ # Check if the GCP_CREDENTIALS_JSON_CONTENT secret is provided
+ if [ -n "$GCP_CREDENTIALS_JSON_CONTENT" ]; then
+     echo "GCP_CREDENTIALS_JSON_CONTENT secret found. Writing to $CREDENTIALS_FILE_PATH"
+     # Use printf to preserve newlines and special characters correctly
+     printf "%s" "$GCP_CREDENTIALS_JSON_CONTENT" > "$CREDENTIALS_FILE_PATH"
+     export GOOGLE_APPLICATION_CREDENTIALS="$CREDENTIALS_FILE_PATH"
+     echo "GOOGLE_APPLICATION_CREDENTIALS set to $CREDENTIALS_FILE_PATH"
+
+     # Optional: basic check that the file looks like valid JSON
+     if command -v jq >/dev/null && jq -e . "$CREDENTIALS_FILE_PATH" >/dev/null 2>&1; then
+         echo "Credentials file appears to be valid JSON."
+     else
+         echo "Warning: credentials file may not be valid JSON."
+         # cat "$CREDENTIALS_FILE_PATH"  # Uncomment to print the content for debugging
+     fi
+
+ else
+     echo "Warning: GCP_CREDENTIALS_JSON_CONTENT secret not found. GCP services might not authenticate."
+ fi
+
+ exec "$@"
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ