File size: 3,287 Bytes
ad87194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from core.prompts.custom_prompts import _custom_prompts
from core.services.answer_query.answerquery import AnswerQuery
from core.services.document.add_document import AddDocument
from core.services.embeddings.Qdrant_BM25_embedding import qdrant_bm25_embedding
from core.services.embeddings.jina_embeddings import jina_embedding
from core.services.get_links.web_scraper import WebScraper
from core.services.ocr.replicate_ocr.replicate_ocr import ReplicateOCR as OCRService
from core.services.pdf_extraction.image_pdf.image_pdf_text_extraction import get_text_from_image_pdf
from core.services.pdf_extraction.text_pdf.text_pdf_extraction import extract_text_from_pdf
from core.services.website_url.text_extraction_urlsnew import WebScrapertext
from core.utils.utils import json_parser


class ChatAIPipeline:
    """Facade that wires together the RAG answer service, document ingestion,
    OCR, PDF text extraction, and web-scraping helpers behind one object.

    All heavy lifting is delegated to the injected service classes; this class
    only constructs them once and exposes thin pass-through methods.
    """

    def __init__(self):
        """Build the prompts, embeddings, and service objects used by the pipeline."""
        # Output parser shared by the follow-up prompt and the answer service.
        parser = json_parser()

        # Prompt templates come from the project-wide custom prompt registry.
        answer_prompt = ChatPromptTemplate.from_template(_custom_prompts["RAG_ANSWER_PROMPT"])
        follow_up = PromptTemplate(
            template=_custom_prompts["FOLLOW_UP_PROMPT"],
            input_variables=["context"],
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )

        # Dense + sparse embeddings are shared between ingestion and querying.
        self.vector_embedding = jina_embedding()
        self.sparse_embedding = qdrant_bm25_embedding()

        self.add_document_service = AddDocument(self.vector_embedding, self.sparse_embedding)
        self.answer_query_service = AnswerQuery(
            vector_embedding=self.vector_embedding,
            sparse_embedding=self.sparse_embedding,
            prompt=answer_prompt,
            follow_up_prompt=follow_up,
            json_parser=parser,
        )
        self.get_website_links = WebScraper()
        self.ocr_service = OCRService()
        self.web_text_extractor = WebScrapertext()

    def add_document_(self, texts: list[tuple[str]], vectorstore: str):
        """Index *texts* into the named vector store via the ingestion service."""
        return self.add_document_service.add_documents(texts=texts, vectorstore=vectorstore)

    def answer_query_(self, query: str, vectorstore: str, llm_model: str = "llama-3.3-70b-versatile"):
        """Answer *query* against *vectorstore*; returns (answer, follow-ups, source)."""
        answer, follow_ups, source = self.answer_query_service.answer_query(
            query=query,
            vectorstore=vectorstore,
            llmModel=llm_model,
        )
        return answer, follow_ups, source

    def get_links_(self, url: str, timeout: int):
        """Collect outgoing links from *url*, bounded by *timeout*."""
        return self.get_website_links.get_links(url=url, timeout=timeout)

    def image_pdf_text_extraction_(self, image_pdf: bytes):
        """Extract text from a scanned (image-based) PDF given as raw bytes."""
        return get_text_from_image_pdf(pdf_bytes=image_pdf)

    def text_pdf_extraction_(self, pdf: str):
        """Extract text from a text-based PDF located at path *pdf*."""
        return extract_text_from_pdf(pdf_path=pdf)

    def website_url_text_extraction_(self, url: str):
        """Scrape and return the textual content of a single URL."""
        return self.web_text_extractor.extract_text_from_url(url=url)

    def website_url_text_extraction_list_(self, urls: list):
        """Scrape and return the textual content of several URLs."""
        return self.web_text_extractor.extract_text_from_urls(urls=urls)