luanpoppe committed on
Commit
9d69740
·
1 Parent(s): a263183

feat: removendo arquivos e pastas desnecessárias

Browse files
_antigos/__init__.py DELETED
File without changes
_antigos/pdfs/__init__.py DELETED
File without changes
_antigos/pdfs/admin.py DELETED
@@ -1,7 +0,0 @@
1
- from django.contrib import admin
2
-
3
- # from pdfs.models import PDFsModel
4
-
5
- # Register your models here.
6
-
7
- # admin.site.register(PDFsModel)
 
 
 
 
 
 
 
 
_antigos/pdfs/apps.py DELETED
@@ -1,6 +0,0 @@
1
- from django.apps import AppConfig
2
-
3
-
4
- class PdfsConfig(AppConfig):
5
- default_auto_field = "django.db.models.BigAutoField"
6
- name = "pdfs"
 
 
 
 
 
 
 
_antigos/pdfs/migrations/0001_initial.py DELETED
@@ -1,21 +0,0 @@
1
- # Generated by Django 4.1 on 2024-11-09 22:42
2
-
3
- from django.db import migrations, models
4
-
5
-
6
- class Migration(migrations.Migration):
7
-
8
- initial = True
9
-
10
- dependencies = [
11
- ]
12
-
13
- operations = [
14
- migrations.CreateModel(
15
- name='EndpointTesteModel',
16
- fields=[
17
- ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
18
- ('teste', models.CharField(max_length=300)),
19
- ],
20
- ),
21
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
_antigos/pdfs/migrations/0002_delete_endpointtestemodel.py DELETED
@@ -1,16 +0,0 @@
1
- # Generated by Django 4.1 on 2024-11-16 00:46
2
-
3
- from django.db import migrations
4
-
5
-
6
- class Migration(migrations.Migration):
7
-
8
- dependencies = [
9
- ('pdfs', '0001_initial'),
10
- ]
11
-
12
- operations = [
13
- migrations.DeleteModel(
14
- name='EndpointTesteModel',
15
- ),
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
_antigos/pdfs/migrations/__init__.py DELETED
File without changes
_antigos/pdfs/models.py DELETED
@@ -1,4 +0,0 @@
1
- from django.db import models
2
-
3
- # Create your models here.
4
- # class PDFsModel(models.Model):
 
 
 
 
 
_antigos/pdfs/serializer.py DELETED
@@ -1,8 +0,0 @@
1
- from rest_framework import serializers
2
-
3
- class PDFUploadSerializer(serializers.Serializer):
4
- files = serializers.ListField(child=serializers.FileField(), required=True)
5
- system_prompt = serializers.CharField(required=True)
6
- user_message = serializers.CharField(required=True)
7
- model = serializers.CharField(required=False)
8
- embedding = serializers.CharField(required=False)
 
 
 
 
 
 
 
 
 
_antigos/pdfs/tests.py DELETED
@@ -1,3 +0,0 @@
1
- from django.test import TestCase
2
-
3
- # Create your tests here.
 
 
 
 
_antigos/pdfs/views.py DELETED
@@ -1,52 +0,0 @@
1
- import tempfile, os
2
- from pdfs.serializer import PDFUploadSerializer
3
- from setup.environment import default_model
4
- from drf_spectacular.utils import extend_schema
5
-
6
- from rest_framework.decorators import api_view, parser_classes
7
- from rest_framework.parsers import MultiPartParser
8
- from rest_framework.response import Response
9
-
10
- from _utils.main import get_llm_answer
11
-
12
- @extend_schema(
13
- request=PDFUploadSerializer,
14
- )
15
- @api_view(["POST"])
16
- @parser_classes([MultiPartParser])
17
- def getPDF(request):
18
- if request.method == "POST":
19
- serializer = PDFUploadSerializer(data=request.data)
20
- if serializer.is_valid(raise_exception=True):
21
- listaPDFs = []
22
- print('\n\n')
23
- data = request.data
24
- print('data: ', data)
25
- embedding = serializer.validated_data.get("embedding", "gpt")
26
- model = serializer.validated_data.get("model", default_model)
27
-
28
- # pdf_file = serializer.validated_data['file']
29
- for file in serializer.validated_data['files']:
30
- print("file: ", file)
31
- file.seek(0)
32
- # Create a temporary file to save the uploaded PDF
33
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
34
- # Write the uploaded file content to the temporary file
35
- for chunk in file.chunks():
36
- temp_file.write(chunk)
37
- temp_file_path = temp_file.name # Get the path of the temporary file
38
- listaPDFs.append(temp_file_path)
39
- # print('temp_file_path: ', temp_file_path)
40
- print('listaPDFs: ', listaPDFs)
41
-
42
- resposta_llm = None
43
- # resposta_llm = get_llm_answer(data["system_prompt"], data["user_message"], temp_file_path, model=model, embedding=embedding)
44
- resposta_llm = get_llm_answer(data["system_prompt"], data["user_message"], listaPDFs, model=model, embedding=embedding)
45
-
46
- for file in listaPDFs:
47
- os.remove(file)
48
- # os.remove(temp_file_path)
49
-
50
- return Response({
51
- "Resposta": resposta_llm
52
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
_antigos/resumos/__init__.py DELETED
File without changes
_antigos/resumos/admin.py DELETED
@@ -1,3 +0,0 @@
1
- from django.contrib import admin
2
-
3
- # Register your models here.
 
 
 
 
_antigos/resumos/apps.py DELETED
@@ -1,6 +0,0 @@
1
- from django.apps import AppConfig
2
-
3
-
4
- class ResumosConfig(AppConfig):
5
- default_auto_field = 'django.db.models.BigAutoField'
6
- name = 'resumos'
 
 
 
 
 
 
 
_antigos/resumos/migrations/__init__.py DELETED
File without changes
_antigos/resumos/models.py DELETED
@@ -1,3 +0,0 @@
1
- from django.db import models
2
-
3
- # Create your models here.
 
 
 
 
_antigos/resumos/serializer.py DELETED
@@ -1,29 +0,0 @@
1
- from rest_framework import serializers
2
- from setup.environment import default_model
3
- # from _utils.utils import DEFAULT_SYSTEM_PROMPT
4
-
5
- prompt_template = """
6
- Based on the following context, provide multiple key points from the document.
7
- For each point, create a new paragraph.
8
- Each paragraph should be a complete, self-contained insight.
9
-
10
- Context: {context}
11
-
12
- Key points:
13
- """
14
-
15
- class ResumoPDFSerializer(serializers.Serializer):
16
- files = serializers.ListField(child=serializers.FileField(), required=True)
17
- system_prompt = serializers.CharField(required=False)
18
- user_message = serializers.CharField(required=False, default="")
19
- model = serializers.CharField(required=False)
20
- iterative_refinement = serializers.BooleanField(required=False, default=False) # type: ignore
21
-
22
- class ResumoCursorSerializer(serializers.Serializer):
23
- files = serializers.ListField(child=serializers.FileField(), required=True)
24
- system_prompt = serializers.CharField(required=False, default=prompt_template)
25
- user_message = serializers.CharField(required=False, default="")
26
- model = serializers.CharField(required=False, default=default_model)
27
- hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
28
- chunk_size = serializers.IntegerField(required=False, default=3500)
29
- chunk_overlap = serializers.IntegerField(required=False, default=800)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
_antigos/resumos/tests.py DELETED
@@ -1,3 +0,0 @@
1
- from django.test import TestCase
2
-
3
- # Create your tests here.
 
 
 
 
_antigos/resumos/views.py DELETED
@@ -1,144 +0,0 @@
1
- from rest_framework.views import APIView
2
- import tempfile, os
3
- from rest_framework.response import Response
4
- from _utils.resumo_simples_cursor import get_llm_summary_answer_by_cursor
5
- from _utils.utils import DEFAULT_SYSTEM_PROMPT
6
- from .serializer import (
7
- ResumoPDFSerializer,
8
- ResumoCursorSerializer,
9
- )
10
- from _utils.main import get_llm_answer_summary, get_llm_answer_summary_with_embedding
11
- from setup.environment import default_model
12
- from rest_framework.parsers import MultiPartParser
13
- from drf_spectacular.utils import extend_schema
14
-
15
-
16
- class ResumoView(APIView):
17
- parser_classes = [MultiPartParser]
18
-
19
- @extend_schema(
20
- request=ResumoPDFSerializer,
21
- )
22
- def post(self, request):
23
- serializer = ResumoPDFSerializer(data=request.data)
24
- if serializer.is_valid(raise_exception=True):
25
- listaPDFs = []
26
- data = serializer.validated_data
27
- model = serializer.validated_data.get("model", default_model)
28
- print("serializer.validated_data: ", serializer.validated_data)
29
-
30
- for file in serializer.validated_data["files"]:
31
- print("file: ", file)
32
- file.seek(0)
33
- with tempfile.NamedTemporaryFile(
34
- delete=False, suffix=".pdf"
35
- ) as temp_file: # Create a temporary file to save the uploaded PDF
36
- for (
37
- chunk
38
- ) in (
39
- file.chunks()
40
- ): # Write the uploaded file content to the temporary file
41
- temp_file.write(chunk)
42
- temp_file_path = (
43
- temp_file.name
44
- ) # Get the path of the temporary file
45
- listaPDFs.append(temp_file_path)
46
- # print('listaPDFs: ', listaPDFs)
47
-
48
- system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
49
- resposta_llm = get_llm_answer_summary(
50
- system_prompt,
51
- data["user_message"],
52
- listaPDFs,
53
- model=model,
54
- isIterativeRefinement=data["iterative_refinement"],
55
- )
56
-
57
- for file in listaPDFs:
58
- os.remove(file)
59
-
60
- return Response({"resposta": resposta_llm})
61
-
62
-
63
- class ResumoEmbeddingView(APIView):
64
- parser_classes = [MultiPartParser]
65
-
66
- @extend_schema(
67
- request=ResumoPDFSerializer,
68
- )
69
- def post(self, request):
70
- serializer = ResumoPDFSerializer(data=request.data)
71
- if serializer.is_valid(raise_exception=True):
72
- listaPDFs = []
73
- data = serializer.validated_data
74
- model = serializer.validated_data.get("model", default_model)
75
- print("serializer.validated_data: ", serializer.validated_data)
76
-
77
- for file in serializer.validated_data["files"]:
78
- file.seek(0)
79
- with tempfile.NamedTemporaryFile(
80
- delete=False, suffix=".pdf"
81
- ) as temp_file: # Create a temporary file to save the uploaded PDF
82
- for (
83
- chunk
84
- ) in (
85
- file.chunks()
86
- ): # Write the uploaded file content to the temporary file
87
- temp_file.write(chunk)
88
- temp_file_path = (
89
- temp_file.name
90
- ) # Get the path of the temporary file
91
- listaPDFs.append(temp_file_path)
92
- print("listaPDFs: ", listaPDFs)
93
-
94
- system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
95
- resposta_llm = get_llm_answer_summary_with_embedding(
96
- system_prompt,
97
- data["user_message"],
98
- listaPDFs,
99
- model=model,
100
- isIterativeRefinement=data["iterative_refinement"],
101
- )
102
-
103
- for file in listaPDFs:
104
- os.remove(file)
105
-
106
- return Response({"resposta": resposta_llm})
107
-
108
-
109
- class ResumoSimplesCursorView(APIView):
110
- parser_classes = [MultiPartParser]
111
-
112
- @extend_schema(
113
- request=ResumoCursorSerializer,
114
- )
115
- def post(self, request):
116
- serializer = ResumoCursorSerializer(data=request.data)
117
- if serializer.is_valid(raise_exception=True):
118
- listaPDFs = []
119
- data = serializer.validated_data
120
- print("\nserializer.validated_data: ", serializer.validated_data)
121
-
122
- for file in serializer.validated_data["files"]:
123
- file.seek(0)
124
- with tempfile.NamedTemporaryFile(
125
- delete=False, suffix=".pdf"
126
- ) as temp_file: # Create a temporary file to save the uploaded PDF
127
- for (
128
- chunk
129
- ) in (
130
- file.chunks()
131
- ): # Write the uploaded file content to the temporary file
132
- temp_file.write(chunk)
133
- temp_file_path = (
134
- temp_file.name
135
- ) # Get the path of the temporary file
136
- listaPDFs.append(temp_file_path)
137
- print("listaPDFs: ", listaPDFs)
138
-
139
- resposta_llm = get_llm_summary_answer_by_cursor(data, listaPDFs)
140
-
141
- for file in listaPDFs:
142
- os.remove(file)
143
-
144
- return Response({"resposta": resposta_llm})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
_utils/resumo_simples_cursor.py DELETED
@@ -1,234 +0,0 @@
1
- import os
2
- from typing import List, Dict, Tuple
3
- from setup.easy_imports import (
4
- HuggingFaceEmbeddings,
5
- PyPDFLoader,
6
- Chroma,
7
- ChatOpenAI,
8
- create_extraction_chain,
9
- PromptTemplate,
10
- RecursiveCharacterTextSplitter,
11
- )
12
- from dataclasses import dataclass
13
- import uuid
14
- import json
15
- from langchain_huggingface import HuggingFaceEndpoint
16
- from setup.environment import default_model
17
-
18
- os.environ["LANGCHAIN_TRACING_V2"] = "true"
19
- os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
20
- os.environ.get("LANGCHAIN_API_KEY")
21
- os.environ["LANGCHAIN_PROJECT"] = "VELLA"
22
-
23
-
24
- @dataclass
25
- class DocumentChunk:
26
- content: str
27
- page_number: int
28
- chunk_id: str
29
- start_char: int
30
- end_char: int
31
-
32
-
33
- class DocumentSummarizer:
34
-
35
- def __init__(
36
- self, openai_api_key: str, model, embedding, chunk_config, system_prompt
37
- ):
38
- self.model = model
39
- self.system_prompt = system_prompt
40
- self.openai_api_key = openai_api_key
41
- self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
42
- self.text_splitter = RecursiveCharacterTextSplitter(
43
- chunk_size=chunk_config["size"], chunk_overlap=chunk_config["overlap"]
44
- )
45
- self.chunk_metadata = {} # Store chunk metadata for tracing
46
-
47
- def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
48
- """Load PDF and split into chunks with metadata"""
49
- loader = PyPDFLoader(pdf_path)
50
- pages = loader.load()
51
- chunks = []
52
- char_count = 0
53
-
54
- for page in pages:
55
- text = page.page_content
56
- # Split the page content
57
- page_chunks = self.text_splitter.split_text(text)
58
-
59
- for chunk in page_chunks:
60
- chunk_id = str(uuid.uuid4())
61
- start_char = text.find(chunk)
62
- end_char = start_char + len(chunk)
63
-
64
- doc_chunk = DocumentChunk(
65
- content=chunk,
66
- page_number=page.metadata.get("page") + 1, # 1-based page numbering
67
- chunk_id=chunk_id,
68
- start_char=char_count + start_char,
69
- end_char=char_count + end_char,
70
- )
71
- chunks.append(doc_chunk)
72
-
73
- # Store metadata for later retrieval
74
- self.chunk_metadata[chunk_id] = {
75
- "page": doc_chunk.page_number,
76
- "start_char": doc_chunk.start_char,
77
- "end_char": doc_chunk.end_char,
78
- }
79
-
80
- char_count += len(text)
81
-
82
- return chunks
83
-
84
- def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
85
- """Create vector store with metadata"""
86
- texts = [chunk.content for chunk in chunks]
87
- metadatas = [
88
- {
89
- "chunk_id": chunk.chunk_id,
90
- "page": chunk.page_number,
91
- "start_char": chunk.start_char,
92
- "end_char": chunk.end_char,
93
- }
94
- for chunk in chunks
95
- ]
96
-
97
- vector_store = Chroma.from_texts(
98
- texts=texts, metadatas=metadatas, embedding=self.embeddings
99
- )
100
- return vector_store
101
-
102
- def generate_summary_with_sources(
103
- self,
104
- vector_store: Chroma,
105
- query: str = "Summarize the main points of this document",
106
- ) -> List[Dict]:
107
- """Generate summary with source citations, returning structured JSON data"""
108
- # Retrieve relevant chunks with metadata
109
- relevant_docs = vector_store.similarity_search_with_score(query, k=5)
110
-
111
- # Prepare context and track sources
112
- contexts = []
113
- sources = []
114
-
115
- for doc, score in relevant_docs:
116
- chunk_id = doc.metadata["chunk_id"]
117
- context = doc.page_content
118
- contexts.append(context)
119
-
120
- sources.append(
121
- {
122
- "content": context,
123
- "page": doc.metadata["page"],
124
- "chunk_id": chunk_id,
125
- "relevance_score": score,
126
- }
127
- )
128
-
129
- prompt = PromptTemplate(
130
- template=self.system_prompt, input_variables=["context"]
131
- )
132
- llm = ""
133
-
134
- if self.model == default_model:
135
- llm = ChatOpenAI(
136
- temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
137
- )
138
- else:
139
- llm = HuggingFaceEndpoint(
140
- repo_id=self.model,
141
- task="text-generation",
142
- max_new_tokens=1100,
143
- do_sample=False,
144
- huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
145
- )
146
-
147
- response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content
148
-
149
- # Split the response into paragraphs
150
- summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
151
-
152
- # Create structured output
153
- structured_output = []
154
- for idx, summary in enumerate(summaries):
155
- # Associate each summary with the most relevant source
156
- structured_output.append(
157
- {
158
- "content": summary,
159
- "source": {
160
- "page": sources[min(idx, len(sources) - 1)]["page"],
161
- "text": sources[min(idx, len(sources) - 1)]["content"][:200]
162
- + "...",
163
- "relevance_score": sources[min(idx, len(sources) - 1)][
164
- "relevance_score"
165
- ],
166
- },
167
- }
168
- )
169
-
170
- return structured_output
171
-
172
- def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
173
- """Get extended context around a specific chunk"""
174
- metadata = self.chunk_metadata.get(chunk_id)
175
- if not metadata:
176
- return None
177
-
178
- return {
179
- "page": metadata["page"],
180
- "start_char": metadata["start_char"],
181
- "end_char": metadata["end_char"],
182
- }
183
-
184
-
185
- def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
186
- # By Luan
187
- allPdfsChunks = []
188
-
189
- # Initialize summarizer
190
- summarizer = DocumentSummarizer(
191
- openai_api_key=os.environ.get("OPENAI_API_KEY"),
192
- embedding=serializer["hf_embedding"],
193
- chunk_config={
194
- "size": serializer["chunk_size"],
195
- "overlap": serializer["chunk_overlap"],
196
- },
197
- system_prompt=serializer["system_prompt"],
198
- model=serializer["model"],
199
- )
200
-
201
- # Load and process document
202
- for pdf in listaPDFs:
203
- pdf_path = pdf
204
- chunks = summarizer.load_and_split_document(pdf_path)
205
- allPdfsChunks = allPdfsChunks + chunks
206
-
207
- vector_store = summarizer.create_vector_store(allPdfsChunks)
208
-
209
- # Generate structured summary
210
- structured_summaries = summarizer.generate_summary_with_sources(vector_store)
211
-
212
- # Print or return the structured data
213
- # print(structured_summaries)
214
- json_data = json.dumps(structured_summaries)
215
- print("\n\n")
216
- print(json_data)
217
- return structured_summaries
218
- # If you need to send to frontend, you can just return structured_summaries
219
- # It will be in the format:
220
- # [
221
- # {
222
- # "content": "Summary point 1...",
223
- # "source": {
224
- # "page": 1,
225
- # "text": "Source text...",
226
- # "relevance_score": 0.95
227
- # }
228
- # },
229
- # ...
230
- # ]
231
-
232
-
233
- if __name__ == "__main__":
234
- get_llm_summary_answer_by_cursor()