File size: 1,781 Bytes
6c945f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# -*-coding:utf-8 -*-
import os
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings, CohereEmbeddings
from retry import retry
from key import CoherenceKey, OpenaiKey
# Base output directory for FAISS index data; doc2vec joins the optional
# folder_name onto this path to decide where the index is dumped.
OUTPUT_DIR = './output/'
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
    """Add a single document to ``store``, retrying when the call fails.

    The ``retry`` decorator is configured with ``tries=10, delay=60``, so a
    failing call is re-attempted with a 60 second pause between attempts
    before the last exception is allowed to propagate.

    Args:
        store: Vector store object exposing ``add_texts(texts, metadatas=...)``.
        i: Document-like object providing ``page_content`` and ``metadata``.
    """
    add_texts = store.add_texts
    texts = [i.page_content]
    metadatas = [i.metadata]
    add_texts(texts, metadatas=metadatas)
def doc2vec(docs, model, folder_name=None):
    """Embed ``docs`` into a FAISS index and save it under ``OUTPUT_DIR``.

    The index is initialised from the first document and the remaining
    documents are added one at a time, so that a failure part-way through
    does not force the whole run to start over: whatever has been embedded
    up to that point is still written to disk.

    Args:
        docs: Non-empty sequence of document objects (each providing
            ``page_content`` and ``metadata``).
        model: Embedding backend to use: ``'openai'``, ``'mpnet'`` or
            ``'cohere'``.
        folder_name: Optional sub-folder of ``OUTPUT_DIR`` to save into.
            When omitted, the index is saved directly in ``OUTPUT_DIR``.

    Raises:
        ValueError: If ``docs`` is empty or ``model`` is not supported.
    """
    # Fail fast, before any embedding call is made.
    if not docs:
        raise ValueError('No documents to embed')
    if model not in ('openai', 'mpnet', 'cohere'):
        raise ValueError(f'Embedding Model {model} not supported')

    # Fall back to the base directory so the default folder_name=None works.
    if folder_name:
        out_dir = os.path.join(OUTPUT_DIR, folder_name)
    else:
        out_dir = OUTPUT_DIR

    print(f'Building faiss Index from {len(docs)} docs')
    print(f'Dumping FAISS to {out_dir}')

    if model == 'openai':
        key = os.getenv('OPENAI_API_KEY')
        embeddings = OpenAIEmbeddings(openai_api_key=key)
    elif model == 'mpnet':
        embeddings = HuggingFaceEmbeddings()
    else:
        embeddings = CohereEmbeddings(cohere_api_key=CoherenceKey)

    # Use the first document to initialise the store.
    db = FAISS.from_documents([docs[0]], embeddings)

    # The first document is already in the index; only add the rest, otherwise
    # its vector would be stored twice.
    remaining = docs[1:]
    index = 1  # number of documents embedded so far
    for doc in tqdm(remaining, desc="Embedding 🦖", unit="docs",
                    total=len(remaining),
                    bar_format='{l_bar}{bar}| Time Left: {remaining}'):
        try:
            store_add_texts_with_retry(db, doc)
        except Exception as e:
            # Retries are exhausted: report, stop, and keep the partial index.
            print("Error on ", doc)
            print(f"Cause: {e!r}")
            print("Saving progress")
            print(f"stopped at {index} out of {len(docs)}")
            break
        index += 1

    db.save_local(out_dir)