File size: 4,178 Bytes
3e299e4
 
 
7f184fa
3e299e4
 
 
 
7f184fa
3e299e4
 
 
 
7f184fa
3e299e4
 
 
 
 
7f184fa
3e299e4
7f184fa
 
3e299e4
7f184fa
3e299e4
7f184fa
 
 
 
 
3e299e4
7f184fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e299e4
7f184fa
 
3e299e4
 
 
 
 
 
 
 
 
 
 
7f184fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e299e4
7f184fa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os

# Avoid fork-related warnings from the HuggingFace tokenizers library.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# SECURITY: these API keys were previously hardcoded and committed to source
# control — treat them as leaked and rotate them. `setdefault` lets a value
# supplied through the real environment take precedence over the fallback.
# TODO(review): delete the fallback literals entirely once the keys are
# rotated and provisioned via the deployment environment / a secrets manager.
os.environ.setdefault('MISTRAL_API_KEY', "i5jSJkCFNGKfgIztloxTMjfckiFbYBj4")
os.environ.setdefault('TAVILY_API_KEY', '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')

# Read back through getenv so downstream code has a single source of truth.
mistral_api_key = os.getenv("MISTRAL_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY")
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma, FAISS
from langchain_mistralai import MistralAIEmbeddings
from langchain import hub
from typing import Literal
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_mistralai import ChatMistralAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.tools import DuckDuckGoSearchRun

# urls = [
#     "https://www.toutelanutrition.com/wikifit/guide-nutrition/nutrition-sportive/apports-proteines",

# ]

# docs = [WebBaseLoader(url).load() for url in urls]
# docs_list = [item for sublist in docs for item in sublist]

# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=250, chunk_overlap=0
# )
# doc_splits = text_splitter.split_documents(docs_list)

####### PDF
def load_chunk_persist_pdf(
    pdf_folder_path: str = "data/pdf_folder/",
    persist_directory: str = "data/chroma_store/",
    chunk_size: int = 1000,
    chunk_overlap: int = 10,
) -> Chroma:
    """Load every PDF in *pdf_folder_path*, chunk it, and index it in Chroma.

    Parameters
    ----------
    pdf_folder_path : str
        Folder scanned (non-recursively) for ``.pdf`` files.
    persist_directory : str
        On-disk location of the persisted Chroma store.
    chunk_size, chunk_overlap : int
        Character-based splitter settings.

    Returns
    -------
    Chroma
        The vector store, persisted to *persist_directory*.

    Raises
    ------
    FileNotFoundError
        If *pdf_folder_path* does not exist (propagated from ``os.listdir``).
    """
    documents = []
    # Case-insensitive extension check so e.g. "REPORT.PDF" is not skipped;
    # sorted() makes the ingestion order deterministic across platforms.
    for file_name in sorted(os.listdir(pdf_folder_path)):
        if file_name.lower().endswith('.pdf'):
            loader = PyPDFLoader(os.path.join(pdf_folder_path, file_name))
            documents.extend(loader.load())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunked_documents = text_splitter.split_documents(documents)

    # Embeds every chunk via the Mistral embeddings API (network calls).
    vectorstore = Chroma.from_documents(
        documents=chunked_documents,
        embedding=MistralAIEmbeddings(),
        persist_directory=persist_directory,
    )
    # NOTE(review): Chroma.persist() is deprecated in recent langchain-chroma
    # releases (persistence is automatic when persist_directory is set); kept
    # for compatibility with the version this file was written against.
    vectorstore.persist()
    return vectorstore

# NOTE(review): dead commented-out single-PDF loading path below — superseded
# by load_chunk_persist_pdf(); consider deleting it.
# from langchain_community.document_loaders import PyPDFLoader
# loader = PyPDFLoader('data/fitness_programs/ZeroToHero.pdf')
# pages = loader.load_and_split()

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# splits = text_splitter.split_documents(pages)
# vectorstore = Chroma.from_documents(documents=splits, embedding=MistralAIEmbeddings())
# Builds (and persists) the PDF-backed Chroma index at import time — importing
# this module therefore performs embedding API calls and disk writes.
vectorstore = load_chunk_persist_pdf()
# Default similarity-search retriever over the store.
retriever = vectorstore.as_retriever()
# Community RAG prompt pulled from LangChain Hub (network call).
prompt = hub.pull("rlm/rag-prompt")


# Data model
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    # Structured-output target: the LLM must choose exactly one of the two
    # routes. The Field description is sent to the model as part of the tool
    # schema, so it is behavioral text — do not edit it casually.
    datasource: Literal["vectorstore", "websearch"] = Field(
        ...,
        description="Given a user question choose to route it to web search or a vectorstore.",
    )

# LLM with function call — temperature 0 for deterministic routing decisions.
llm = ChatMistralAI(model="mistral-large-latest", mistral_api_key=mistral_api_key, temperature=0)

# structured_llm_router = llm.with_structured_output(RouteQuery, method="json_mode")

# Prompt
# BUG FIX: the previous system prompt was copied verbatim from the LangChain
# adaptive-RAG tutorial and claimed the vectorstore held documents about
# "agents, prompt engineering, and adversarial attacks" — but the store built
# above is indexed from fitness/nutrition PDFs, so on-topic questions would
# have been routed to web search. The description now matches the corpus.
# (Presumed corpus topic from the 'data/fitness_programs/' path and the
# fitness queries below — confirm against the actual PDF folder contents.)
system = """You are an expert at routing a user question to a vectorstore or web search.
The vectorstore contains documents related to fitness programs, training, and nutrition.
Use the vectorstore for questions on these topics. For all else, use web-search."""
route_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)
# NOTE: removed a duplicate `prompt = hub.pull("rlm/rag-prompt")` here — the
# identical prompt is already pulled earlier in this module, and the second
# pull only repeated the LangChain Hub network round-trip.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Concatenate the retrieved documents' text, separated by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
    
    
# LCEL pipeline: fan the input question into both the retriever (piped
# through format_docs to yield a context string) and the prompt's {question}
# slot, fill the RAG prompt, call the Mistral chat model, and parse the
# response down to a plain string.
rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

# Runs at import time: one end-to-end RAG call (embedding + LLM API usage).
print(rag_chain.invoke("Build a fitness program for me. Be precise in terms of exercises"))

# print(rag_chain.invoke("I am a 45 years old woman and I have to loose weight for the summer. Provide me with a fitness program"))