Spaces:
Running
Running
luanpoppe
committed on
Commit
·
d32424b
1
Parent(s):
eebeb78
feat: adicionando e melhorando utilitários do langchain
Browse files- _utils/langchain_utils/Chain_class.py +20 -8
- _utils/langchain_utils/Document_class.py +19 -0
- _utils/langchain_utils/LLM_class.py +3 -2
- _utils/langchain_utils/Prompt_class.py +6 -0
- _utils/langchain_utils/Splitter_class.py +28 -0
- _utils/langchain_utils/embeddings.py +13 -0
- _utils/langchain_utils/retriever.py +15 -0
- _utils/langchain_utils/vector_stores.py +10 -0
- setup/easy_imports.py +4 -1
- setup/tokens.py +11 -0
_utils/langchain_utils/Chain_class.py
CHANGED
@@ -1,11 +1,23 @@
|
|
|
|
|
|
1 |
class Chain:
|
2 |
-
def __init__(self, prompt, model):
|
3 |
-
self.prompt = prompt
|
4 |
-
self.model = model
|
5 |
|
6 |
-
def create_prompt_model_chain(self):
|
7 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
-
|
10 |
-
chain = self.create_prompt_model_chain()
|
11 |
-
return chain.invoke(invoke_params)
|
|
|
1 |
+
from setup.easy_imports import RunnablePassthrough, create_retrieval_chain


class Chain:
    """Thin helpers for composing LangChain runnables (prompt | model [+ retriever])."""

    def create_prompt_model_chain(self, prompt, model):
        # LCEL pipe: the prompt's output feeds straight into the model.
        return prompt | model

    def create_prompt_model_retriever_chain(self, prompt, model, retriever):
        # Wraps prompt|model with create_retrieval_chain so retrieved documents
        # are fetched and injected before the model runs.
        chain = prompt | model
        return create_retrieval_chain(retriever, chain)

    def invoke_retrieval_chain(self, chain, busca_no_vetor):
        # "busca_no_vetor" is the vector-store query string passed as "input".
        resposta = chain.invoke({"input": busca_no_vetor})

        # Small wrapper exposing the answer text next to the raw response dict.
        # NOTE(review): assumes the chain's result has an "answer" message with
        # a .content attribute — true for create_retrieval_chain outputs.
        class Resposta:
            def __init__(self, resposta):
                self.final_answer = resposta["answer"].content
                self.complete_obj = resposta

        return Resposta(resposta)


chain = Chain()
|
|
|
|
_utils/langchain_utils/Document_class.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setup.easy_imports import PyPDFLoader


class Document_Class:
    """Convenience wrappers around PyPDFLoader for reading PDF content."""

    def load_pdf(self, pdf, ocr=False):
        """Load *pdf* into a list of LangChain documents (one per page).

        ocr=True also extracts text from embedded images.
        """
        return PyPDFLoader(pdf, extract_images=ocr).load()

    def load_and_split_pdf(self, pdf, ocr=False):
        """Load *pdf* and split it with PyPDFLoader's default splitter."""
        return PyPDFLoader(pdf, extract_images=ocr).load_and_split()

    def get_pdf_text(self, pdf, ocr=False):
        """Return the concatenated text of every page of *pdf*."""
        # "".join avoids the quadratic cost of repeated += string concatenation.
        return "".join(page.page_content for page in self.load_pdf(pdf, ocr))


document = Document_Class()
|
_utils/langchain_utils/LLM_class.py
CHANGED
@@ -6,14 +6,15 @@ import os
|
|
6 |
|
7 |
deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
|
8 |
google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))
|
|
|
9 |
|
10 |
|
11 |
class LLM:
|
12 |
def __init__(self):
|
13 |
pass
|
14 |
|
15 |
-
|
16 |
-
|
17 |
|
18 |
def deepseek(self, model="deepseek-chat"):
|
19 |
return ChatOpenAI(
|
|
|
6 |
|
7 |
deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
|
8 |
google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))
|
9 |
+
open_ai_token = cast(str, os.environ.get("OPENAI_API_KEY"))
|
10 |
|
11 |
|
12 |
class LLM:
|
13 |
def __init__(self):
|
14 |
pass
|
15 |
|
16 |
+
def open_ai(self, model="gpt-4o-mini"):
|
17 |
+
return ChatOpenAI(api_key=SecretStr(open_ai_token), model=model)
|
18 |
|
19 |
def deepseek(self, model="deepseek-chat"):
|
20 |
return ChatOpenAI(
|
_utils/langchain_utils/Prompt_class.py
CHANGED
@@ -11,4 +11,10 @@ class Prompt:
|
|
11 |
)
|
12 |
return prompt_template
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
prompt = Prompt()
|
|
|
11 |
)
|
12 |
return prompt_template
|
13 |
|
14 |
+
def create_and_invoke_prompt(self, user_prompt, system_prompt="", dynamic_dict={}):
|
15 |
+
return ChatPromptTemplate.from_messages(
|
16 |
+
[("system", system_prompt), ("user", user_prompt)]
|
17 |
+
).invoke(dynamic_dict)
|
18 |
+
|
19 |
+
|
20 |
prompt = Prompt()
|
_utils/langchain_utils/Splitter_class.py
CHANGED
@@ -170,3 +170,31 @@ class Splitter:
|
|
170 |
char_count += len(text)
|
171 |
|
172 |
return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
char_count += len(text)
|
171 |
|
172 |
return chunks
|
173 |
+
|
174 |
+
|
175 |
+
class Splitter_Simple:
    """Minimal wrapper around RecursiveCharacterTextSplitter for PDFs and raw text."""

    def __init__(
        self,
        chunk_size=1000,
        chunk_overlap=400,
    ):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    async def load_and_split_document(self, pdf_path: str):
        """Load PDF and split into chunks with metadata"""
        # NOTE(review): nothing is awaited here; async is kept only so existing
        # awaiting callers stay compatible.
        print("\nCOMEÇANDO LEITURA DO PDF")
        pages = PyPDFLoader(pdf_path).load_and_split(self.text_splitter)
        print("\nTERMINADO LEITURA DO PDF")

        return pages

    def load_and_split_text(self, text: str) -> List[Document]:
        """Split raw *text* into Document chunks (one Document per chunk)."""
        # Comprehension replaces the manual append loop; output order unchanged.
        return [
            Document(page_content=chunk)
            for chunk in self.text_splitter.split_text(text)
        ]
|
_utils/langchain_utils/embeddings.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

from pydantic import SecretStr
from setup.easy_imports import OpenAIEmbeddings
from setup.tokens import openai_api_key


class EmbeddingClass:
    """Factory for the embedding models used by the project's vector stores."""

    def open_ai(self):
        # SecretStr is the concrete secret type OpenAIEmbeddings' api_key
        # expects — consistent with LLM_class.py and no longer needs the
        # original "# type: ignore" that pydantic.Secret required.
        return OpenAIEmbeddings(api_key=SecretStr(openai_api_key))


embedding = EmbeddingClass()
|
_utils/langchain_utils/retriever.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from _utils.langchain_utils.vector_stores import vector_store


class Retriever:
    """Builds retrievers on top of the shared vector stores."""

    def chroma_retriever(
        self, lista_de_documents, search_type="similarity", search_kwargs=None
    ):
        """Index *lista_de_documents* in Chroma and return it as a retriever.

        search_kwargs defaults to {"k": 1}; a None sentinel replaces the
        original mutable dict default (shared across calls), same behavior.
        """
        if search_kwargs is None:
            search_kwargs = {"k": 1}
        retriever = vector_store.chroma(lista_de_documents).as_retriever(
            search_type=search_type,
            search_kwargs=search_kwargs,
        )
        return retriever


retriever = Retriever()
|
_utils/langchain_utils/vector_stores.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setup.easy_imports import Chroma
from _utils.langchain_utils.embeddings import embedding


class VectorStoreClass:
    """Factory for vector stores backed by the shared embedding client."""

    def chroma(self, lista_de_documents):
        # In-memory Chroma store; every document is embedded with OpenAI.
        return Chroma.from_documents(lista_de_documents, embedding.open_ai())


vector_store = VectorStoreClass()
|
setup/easy_imports.py
CHANGED
@@ -11,14 +11,17 @@ from langchain_huggingface import HuggingFaceEmbeddings
|
|
11 |
|
12 |
# from langchain_community.embeddings import HuggingFaceEmbeddings
|
13 |
from langchain.prompts import PromptTemplate
|
|
|
14 |
from langchain_core.prompts import ChatPromptTemplate
|
15 |
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
|
16 |
from langchain_community.vectorstores import Chroma
|
17 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
18 |
|
19 |
# from langchain_community.chat_models import ChatOpenAI
|
20 |
-
from langchain_openai import ChatOpenAI
|
21 |
from langchain.schema import Document
|
22 |
from langchain.chains import create_extraction_chain
|
|
|
|
|
23 |
|
24 |
from rank_bm25 import BM25Okapi
|
|
|
11 |
|
12 |
# from langchain_community.embeddings import HuggingFaceEmbeddings
|
13 |
from langchain.prompts import PromptTemplate
|
14 |
+
from langchain_core.runnables import RunnablePassthrough
|
15 |
from langchain_core.prompts import ChatPromptTemplate
|
16 |
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
|
17 |
from langchain_community.vectorstores import Chroma
|
18 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
19 |
|
20 |
# from langchain_community.chat_models import ChatOpenAI
|
21 |
+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
22 |
from langchain.schema import Document
|
23 |
from langchain.chains import create_extraction_chain
|
24 |
+
from langchain.chains.retrieval import create_retrieval_chain
|
25 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
26 |
|
27 |
from rank_bm25 import BM25Okapi
|
setup/tokens.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
from typing import cast

# Centralized API tokens read from the environment at import time.
# NOTE(review): the casts hide that os.environ.get() returns None for the
# entries without a "" default — confirm these are always set in deployment.
openai_api_key = cast(str, os.environ.get("OPENAI_API_KEY", ""))
claude_api_key = cast(str, os.environ.get("CLAUDE_API_KEY"))
langchain_api_key = cast(str, os.environ.get("LANGCHAIN_API_KEY"))
hugging_face_api_key = cast(str, os.environ.get("HUGGINGFACEHUB_API_TOKEN"))
bubble_token = cast(str, os.environ.get("BUBBLE_TOKEN"))
cohere_api_key = cast(str, os.environ.get("COHERE_API_KEY", ""))
# "DEEPSEEKK_API_KEY" (double K) matches the variable name already used in
# LLM_class.py — presumably intentional; do not rename silently.
deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))
|