"This file contains the implementation of the RAG pipeline." | |
from pathlib import Path | |
from haystack import Pipeline | |
from haystack.components.builders import PromptBuilder | |
from haystack.components.converters import MarkdownToDocument | |
from haystack.components.embedders import ( | |
SentenceTransformersDocumentEmbedder, | |
SentenceTransformersTextEmbedder, | |
) | |
from haystack.components.generators import HuggingFaceAPIGenerator | |
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter | |
from haystack.components.retrievers import InMemoryEmbeddingRetriever | |
from haystack.components.writers import DocumentWriter | |
from haystack.document_stores.in_memory import InMemoryDocumentStore | |
from haystack.utils import Secret | |
# Define the paths to the document and the model for embedding the documents and the user query | |
DOCUMENT_PATH = Path("gender_document.md") | |
EMBEDDING_MODEL = "all-MiniLM-L6-v2" | |


def process_document(document_store: InMemoryDocumentStore) -> Pipeline:
    """This function processes the document and stores it in the document store.

    It consists of the following components:
    - MarkdownToDocument: Converts the markdown file to a document (https://docs.haystack.deepset.ai/docs/markdowntodocument)
    - DocumentCleaner: Cleans the document (https://docs.haystack.deepset.ai/docs/documentcleaner)
    - DocumentSplitter: Splits the document into chunks (https://docs.haystack.deepset.ai/docs/documentsplitter)
    - DocumentWriter: Writes the documents to the document store (https://docs.haystack.deepset.ai/docs/documentwriter)
    - SentenceTransformersDocumentEmbedder: Embeds the documents, more precisely the chunks (https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)

    Parameters
    ----------
    document_store : InMemoryDocumentStore
        The document store where the processed document should be stored.

    Returns
    -------
    Pipeline
        The pipeline containing the components to parse, clean, split, embed and write the document to the document store.
        To run the pipeline, you can use the `pipeline.run()` method. If a component needs inputs or arguments, you can pass them as a dictionary to the `run()` method.
        For example: `pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})`.
    """
    # initialize the pipeline
    pipeline = Pipeline()
    # add the components to the pipeline. If you want to add more components, you can do it here.
    # If you want to change the settings of the components, you can do that here as well.
    # MarkdownToDocument: parse the markdown file into a Haystack Document
    pipeline.add_component("converter", MarkdownToDocument())
    # DocumentCleaner: remove extra whitespace, empty lines, etc.
    pipeline.add_component("cleaner", DocumentCleaner())
    # DocumentSplitter: chunk the document into pieces of about 300 words, respecting sentence boundaries
    pipeline.add_component(
        "splitter",
        DocumentSplitter(
            split_by="word", split_length=300, respect_sentence_boundary=True
        ),
    )
    # DocumentWriter: persist the embedded chunks in the document store
    pipeline.add_component("writer", DocumentWriter(document_store=document_store))
    # SentenceTransformersDocumentEmbedder: embed each chunk
    pipeline.add_component(
        "embedder",
        SentenceTransformersDocumentEmbedder(model=EMBEDDING_MODEL),
    )
    # connect the components
    pipeline.connect("converter", "cleaner")
    pipeline.connect("cleaner", "splitter")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder", "writer")
    return pipeline
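

# Example (sketch): build and run the indexing pipeline standalone.
# Assumes gender_document.md is present next to this script.
#
#   store = InMemoryDocumentStore()
#   indexing = process_document(document_store=store)
#   indexing.run({"converter": {"sources": [DOCUMENT_PATH]}})
#   print(store.count_documents())  # number of embedded chunks in the store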


def load_document_store(document_store_settings: dict) -> InMemoryDocumentStore:
    """This function loads the document store with the given settings.

    Parameters
    ----------
    document_store_settings : dict
        The settings for the document store, passed as a dictionary.
        You can find the available settings here: https://docs.haystack.deepset.ai/docs/inmemorydocumentstore

    Returns
    -------
    InMemoryDocumentStore
        The initialized in-memory document store.
    """
    document_store = InMemoryDocumentStore(**document_store_settings)
    return document_store


def get_query_pipeline(
    document_store: InMemoryDocumentStore, generator: HuggingFaceAPIGenerator
) -> Pipeline:
    """
    This function creates a query pipeline that contains the following components:
    - SentenceTransformersTextEmbedder: Embeds the user query (https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder)
    - InMemoryEmbeddingRetriever: Retrieves the most similar documents to the user query (https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever)
    - PromptBuilder: Builds the prompt for the generator (https://docs.haystack.deepset.ai/docs/promptbuilder)
    - HuggingFaceAPIGenerator: Generates the answer to the user query (https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator)

    Parameters
    ----------
    document_store : InMemoryDocumentStore
        The document store where the documents are stored.
    generator : HuggingFaceAPIGenerator
        The generator that produces the answer to the user query.

    Returns
    -------
    Pipeline
        The query pipeline containing the components to embed the user query, retrieve the most similar documents, build the prompt and generate the answer.
    """
    # initialize the query pipeline
    query_pipeline = Pipeline()
    # add the components to the query pipeline
    # SentenceTransformersTextEmbedder: embed the query with the same model as the documents
    query_pipeline.add_component(
        "text_embedder", SentenceTransformersTextEmbedder(model=EMBEDDING_MODEL)
    )
    # InMemoryEmbeddingRetriever: fetch the 10 chunks most similar to the query embedding
    query_pipeline.add_component(
        "retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=10)
    )
    # Template for the PromptBuilder
    template = """
You are an expert on gender strategies and sustainable development. Your task is to provide detailed, well-structured, and informative answers based on the given context.

### Instructions:
- Provide a **comprehensive** and **well-structured** response.
- Include **specific details, key concepts, and relevant examples** where applicable.
- Explain **how and why** aspects of the Gender Strategy are relevant to the given question.
- If necessary, cite relevant sections from the provided context.
- If the available information is insufficient, state clearly: **"The available information does not provide a full answer."** However, summarize the most relevant points that can still help address the question.

### Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}

### Question:
{{ query }}

### Answer:
"""
    # PromptBuilder: render the template with the retrieved documents and the query
    query_pipeline.add_component("prompt_builder", PromptBuilder(template=template))
    # HuggingFaceAPIGenerator: generate the answer from the rendered prompt
    query_pipeline.add_component("llm", generator)
    # connect the components
    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    query_pipeline.connect("retriever", "prompt_builder.documents")
    query_pipeline.connect("prompt_builder", "llm")
    return query_pipeline
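

# Example (sketch): inspect what the retriever returns for a query by also
# requesting its outputs. The sample question is made up for illustration.
#
#   q = "What are the goals of the gender strategy?"
#   result = query_pipeline.run(
#       {"text_embedder": {"text": q}, "prompt_builder": {"query": q}},
#       include_outputs_from=["retriever"],
#   )
#   for doc in result["retriever"]["documents"]:
#       print(doc.score, doc.content[:80])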


def init_generator() -> HuggingFaceAPIGenerator:
    """This function initializes the HuggingFaceAPIGenerator with the given settings.

    You can find the available models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending
    Please note that you need to provide a valid token to use the HuggingFaceAPIGenerator.
    For testing purposes, you can hardcode the token in the script.
    For deployment on Hugging Face Spaces, please save the token as a secret (Settings -> Secrets) and load it with `Secret.from_env_var("your_token_name")`.

    Returns
    -------
    HuggingFaceAPIGenerator
        The initialized generator.
    """
    # initialize the HuggingFaceAPIGenerator
    llm_provider = HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
        # stop generating once the model starts a new "Question" section
        stop_words=["Question"],
        # token=Secret.from_token(""),
        token=Secret.from_env_var("hftoken"),
    )
    return llm_provider


def rag_pipeline() -> Pipeline:
    """This function wraps the whole RAG pipeline.

    It loads the document store, processes the document, initializes the generator and
    creates the query pipeline.

    Returns
    -------
    Pipeline
        The RAG pipeline containing the components to process the document and generate
        the answer to the user query. It is enough to import and call this function from the chat application.
        You can run the pipeline with the `pipeline.run()` method.
        If a component needs inputs or arguments, you can pass them as a dictionary to the `run()` method.
        For example:
            result = rag.run(
                {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
            )
        For debugging purposes, you can include the outputs of individual components, for example the retriever:
            result = rag.run(
                {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
                include_outputs_from=["retriever", "llm"],
            )
    """
    # define the document store settings
    document_store_settings = {"embedding_similarity_function": "cosine"}
    # load the document store
    document_store = load_document_store(document_store_settings)
    # build the pipeline that processes the document and writes it to the document store
    document_pipeline = process_document(document_store=document_store)
    # run the document pipeline
    document_pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})
    # initialize the generator
    llm_provider = init_generator()
    # create the query pipeline
    query_pipeline = get_query_pipeline(
        document_store=document_store, generator=llm_provider
    )
    return query_pipeline
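

if __name__ == "__main__":
    # Minimal smoke test (sketch): build the full RAG pipeline and answer a
    # single question. Assumes gender_document.md is present and a valid
    # Hugging Face token is stored in the environment variable "hftoken";
    # the sample question is made up for illustration.
    rag = rag_pipeline()
    question = "What are the main goals of the gender strategy?"
    result = rag.run(
        {"prompt_builder": {"query": question}, "text_embedder": {"text": question}}
    )
    print(result["llm"]["replies"][0])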