File size: 5,219 Bytes
73b7e8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#import atexit
import gradio as gr
#from langchain.document_loaders import UnstructuredPDFLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
import pinecone
import requests
import sys
#from langchain.prompts.chat import (ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate)
from langchain.chains.question_answering import load_qa_chain
#from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain import HuggingFaceHub
from PyPDF2 import PdfReader
#from langchain.document_loaders import TextLoader
#from sentence_transformers.util import semantic_search
from pathlib import Path
from time import sleep
#import pandas as pd
#import torch
import os
import random
import string

from dotenv import load_dotenv
load_dotenv()

# Source PDF, resolved against the current working directory.
file_path = os.path.join(os.getcwd(), "valuation.pdf")
# Earlier LangChain-loader variant, kept for reference:
#loader = PyPDFLoader("60LEADERSONAI.pdf")
#loader = PyPDFLoader(file_path)
#data = loader.load()
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
#db_texts = text_splitter.split_documents(data)

data = PdfReader(file_path)

# Concatenate the text of every page; pages for which extract_text() returns
# nothing (None or "") are skipped.
raw_text = ''.join(
    page_text
    for page_text in (page.extract_text() for page in data.pages)
    if page_text
)

# Build the splitter and split exactly once, after all pages have been read.
# The previous version re-split the whole accumulated text on every page and
# threw away all but the last result, which is quadratic work for no benefit.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,  # overlap between neighbouring chunks (striding over the text)
    length_function=len,
)

# List of text chunks to embed; an empty list when the PDF has no extractable text.
db_texts = text_splitter.split_text(raw_text) if raw_text else []

class HFEmbeddings:
    """Small client that fetches embeddings from an HTTP inference endpoint.

    The instance POSTs ``{"inputs": ..., "options": {"wait_for_model": True}}``
    to ``api_url`` and returns the decoded JSON body. It exposes
    ``embed_documents``, ``embed_query`` and ``__call__`` so it can be handed
    to code that expects either an embeddings object or a plain callable.

    Args:
        api_url: Full URL of the feature-extraction endpoint.
        headers: HTTP headers sent with every request (e.g. authorization).
        timeout: Optional ``requests`` timeout in seconds. ``None`` (the
            default) waits indefinitely, which matches the previous behaviour.
    """

    def __init__(self, api_url, headers, timeout=None):
        self.api_url = api_url
        self.headers = headers
        self.timeout = timeout

    def get_embeddings(self, texts):
        """POST ``texts`` to the endpoint and return the decoded JSON response.

        Args:
            texts: Value sent as the ``inputs`` field (a string or list of strings).

        Returns:
            The parsed JSON body. NOTE(review): presumably a list of vectors for
            a list input; the exact shape depends on the remote model.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            RuntimeError: If the body is a JSON object carrying an ``error`` key,
                which would otherwise be passed on as if it were embeddings.
        """
        payload = {"inputs": texts, "options": {"wait_for_model": True}}
        response = requests.post(
            self.api_url,
            headers=self.headers,
            json=payload,
            timeout=self.timeout,
        )
        # Surface HTTP-level failures instead of trying to use an error page as data.
        response.raise_for_status()
        embeddings = response.json()
        if isinstance(embeddings, dict) and "error" in embeddings:
            raise RuntimeError(f"Embedding request failed: {embeddings['error']}")
        return embeddings

    def embed_documents(self, texts):
        """Return embeddings for a batch of texts (delegates to ``get_embeddings``)."""
        return self.get_embeddings(texts)

    def embed_query(self, text):
        """Return the embedding for a single query string.

        Provided so the object also satisfies callers that look up an
        ``embed_query`` method; it sends ``text`` unchanged to the endpoint.
        """
        return self.get_embeddings(text)

    def __call__(self, texts):
        """Allow the instance to be used as a plain embedding function."""
        return self.embed_documents(texts)

# Hugging Face settings, read from the process environment (each is None when unset).
# HUGGINGFACEHUB_API_TOKEN is only read here; it is not referenced again in this part of the file.
HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')
hf_token = os.environ.get('hf_token')    # bearer token for the embedding endpoint
model_id = os.environ.get('model_id')    # embedding model, appended to the endpoint URL
repo_id = os.environ.get('repo_id')      # model repository passed to HuggingFaceHub below

# Request headers and endpoint for the feature-extraction pipeline.
headers = {"Authorization": f"Bearer {hf_token}"}
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"

# Embedding client shared by the indexing step and the query handler.
hf_embeddings = HFEmbeddings(api_url, headers)

# Pinecone account settings, read from the environment (None when unset).
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.environ.get('PINECONE_ENVIRONMENT')
PINECONE_INDEX_NAME = os.environ.get('PINECONE_INDEX_NAME')

# Disabled: helper that used to generate a random namespace per run.
#def generate_random_string(length):
#    letters = string.ascii_lowercase
#    return ''.join(random.choice(letters) for i in range(length))
#random_string = generate_random_string(8)

# A fixed namespace is used instead of the random one.
#namespace = random_string
namespace = "HF-GRADIO-0909"
index_name = PINECONE_INDEX_NAME

#def exit_handler():
#    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
#    index_namespace_to_delete = pinecone.Index(index_name=index_name)
#    index_namespace_to_delete.delete(delete_all=True, namespace=namespace)

#atexit.register(exit_handler)

# Connect to Pinecone, then embed the PDF chunks and store them in the
# configured index/namespace. This runs once, at import time.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT,
)
vector_db = Pinecone.from_texts(
    db_texts,
    hf_embeddings,
    index_name=index_name,
    namespace=namespace,
)
# Earlier variant for Document objects (disabled):
#vector_db = Pinecone.from_texts([t.page_content for t in db_texts], hf_embeddings, index_name=index_name, namespace=namespace)

# Startup banner followed by one blank line.
print(
    "***********************************",
    "Pinecone Vector/Embedding DB Ready.",
    "",
    sep="\n",
)

# Hosted LLM used to answer questions; generation options are passed through
# unchanged as model_kwargs.
llm = HuggingFaceHub(
    repo_id=repo_id,
    model_kwargs=dict(
        min_length=100,
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.1,
        top_k=50,
        top_p=0.95,
        # NOTE(review): presumably the token id of the model's end marker
        # ('<|end|>' is what run_chain cuts on); confirm for the configured repo_id.
        eos_token_id=49155,
    ),
)

# "stuff" chain: question-answering chain built over the LLM above.
chain = load_qa_chain(llm=llm, chain_type="stuff")

def run_chain(user_query):
    """Answer ``user_query`` from the indexed PDF and return the text to display.

    The function looks up the 5 most similar chunks in the Pinecone namespace,
    runs the question-answering chain over them, keeps only the text before the
    first ``'<|end|>'`` marker and removes newline characters.

    Args:
        user_query: Question text from the UI. ``None``, empty and
            whitespace-only values are treated as invalid.

    Returns:
        The cleaned answer string, or ``"Invalid inputs."`` when the query is
        missing or blank (previously ``None`` was returned in that case, which
        left the output box empty without any feedback).
    """
    # Guard clause: reject None, "" and whitespace-only input before doing any I/O.
    if user_query is None or not user_query.strip():
        print("Invalid inputs.")
        return "Invalid inputs."

    print("Your query:\n"+user_query)
    vector_db_from_index = Pinecone.from_existing_index(index_name, hf_embeddings, namespace=namespace)
    ss_results = vector_db_from_index.similarity_search(query=user_query, namespace=namespace, k=5)
    initial_ai_response = chain.run(input_documents=ss_results, question=user_query)
    # Drop everything from the end-of-answer marker onwards, then remove newlines.
    temp_ai_response = initial_ai_response.partition('<|end|>')[0]
    final_ai_response = temp_ai_response.replace('\n', '')
    return final_ai_response

# Text-in / text-out Gradio UI around run_chain; launch() starts the app.
iface = gr.Interface(
    fn=run_chain,
    inputs="text",
    outputs="text",
    title="AI Response",
)
iface.launch()