# Created by Leandro Carneiro at 19/01/2024
# Description: builds a Chroma vector store from local .txt documents and uses a
#              conversational RAG chain (LangChain + OpenAI) to generate news articles.
# ------------------------------------------------
#from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import os
import csv

import api_key
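# The api_key module is local to this project and is expected to define OPENAI_KEY.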

def read_csv_to_dict(filename):
    """Read a semicolon-separated CSV of 'filename;url' rows into a dict."""
    data_dict = {}
    with open(filename, mode='r', encoding='utf-8') as file:
        csv_reader = csv.reader(file, delimiter=';')
        for row in csv_reader:
            key, value = row[0], row[1]
            data_dict[key] = value
    return data_dict

def generate_embeddings_and_vectorstore(path):
    try:
        loader = DirectoryLoader(path=path, glob="**/*.txt")
        corpus = loader.load()
        print(f'    Total documents before text_split = {len(corpus)}')

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=400)
        docs = text_splitter.split_documents(corpus)
        num_total_characters = sum([len(x.page_content) for x in docs])
        print(f"    Total de chunks depois do text_split = {len(docs)}")
        print(f"    Média de caracteres por chunk = {num_total_characters / len(docs):,.0f}")

        # Attach the original article URL (from filename_url.csv) to each chunk,
        # so the sources of a generated article can be listed later.
        dict_filename_url = read_csv_to_dict('./local_base/filename_url.csv')
        for doc in docs:
            filename = os.path.basename(doc.metadata["source"])
            doc.metadata["link"] = dict_filename_url.get(filename)

        #print('docs')
        #print(docs)

        openai_api_key = api_key.OPENAI_KEY
        fc_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        vectorstore = Chroma.from_documents(docs, fc_embeddings)

        return vectorstore
    except Exception as e:
        print(str(e))
        return str(e)

class Rag:
    def __init__(self, vectorstore, min_words, max_words):
        self.text = None
        self.vectorstore = vectorstore
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")

        prompt_template = """Your task is to create news to a newspaper based on pieces of texts delimited by <> and a question delimited by <>.
                    Do not make up any information, create the news just based on the given information on the pieces of texts delimited by <>.
                    The news should have a tittle.
                    The news should be written in a formal language.
                    The news should have between {min_words} and {max_words} words and it should be in portuguese language.
                    The news should be about the following context: <{context}>
                    Question: <{question}>
                    Answer here:"""
        self.prompt = PromptTemplate(template=prompt_template,
                                     input_variables=["context", "question"],
                                     partial_variables={"min_words": min_words, "max_words": max_words})

        self.qa = ConversationalRetrievalChain.from_llm(
                    llm=ChatOpenAI(model_name="gpt-3.5-turbo",
                                   temperature=0.3,
                                   openai_api_key=api_key.OPENAI_KEY,
                                   max_tokens=int(int(max_words) + (int(max_words) / 2))), # maximum number of tokens for the answer
                    memory=self.memory,
                    retriever=vectorstore.as_retriever(), #search_kwargs={'k': 3}
                    combine_docs_chain_kwargs={"prompt": self.prompt},
                    chain_type="stuff",#map_reduce, refine, map_rerank
                    return_source_documents=True,
                )
    def generate_text(self, subject):
        query = f"Elabore uma nova notícia sobre {subject}."
        result_text = self.qa.invoke({"question": query})

        # Deduplicate the source links and format them as a numbered list.
        list_result_sources = [doc.metadata['link'] for doc in result_text["source_documents"]]
        result_sources = list(set(list_result_sources))
        str_result_sources = ''
        for i, source in enumerate(result_sources, start=1):
            str_result_sources += f'{i}) {source}\n'

        return (result_text["answer"], str_result_sources)