File size: 1,781 Bytes
6c945f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*-coding:utf-8 -*-
import os
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings, CohereEmbeddings
from retry import retry
from key import CoherenceKey, OpenaiKey

# Output Directory for FAISS Index data
OUTPUT_DIR = './output/'


@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
    """Add one document's text (and metadata) to *store*, retrying on failure.

    Retries up to 10 times with a 60-second pause between attempts, which
    smooths over transient embedding-API errors such as rate limits.

    Args:
        store: a vector store exposing ``add_texts(texts, metadatas=...)``.
        i: a document object with ``page_content`` and ``metadata`` attributes.
    """
    texts = [i.page_content]
    metas = [i.metadata]
    store.add_texts(texts, metadatas=metas)


def doc2vec(docs, model, folder_name=None):
    """Embed *docs* and persist the resulting FAISS index to disk.

    The first document seeds the index; the remaining documents are added
    one at a time (each with retries) so a mid-run failure loses only the
    current document — progress made so far is saved before bailing out.

    Args:
        docs: non-empty list of documents with ``page_content``/``metadata``.
              The list is not modified (previous versions popped the first
              element off the caller's list).
        model: embedding backend, one of ``'openai'``, ``'mpnet'``, ``'cohere'``.
        folder_name: optional subdirectory of ``OUTPUT_DIR`` for the index.

    Raises:
        ValueError: if *docs* is empty or *model* is not supported.
    """
    if not docs:
        raise ValueError('No documents to index')

    # Avoid shadowing the builtin `dir`.
    out_dir = os.path.join(OUTPUT_DIR, folder_name) if folder_name else OUTPUT_DIR

    # Seed the DB with the first document, then process the rest one by one
    # so a problem partway through does not force a restart from scratch.
    print(f'Building faiss Index from {len(docs)} docs')
    seed, remaining = docs[:1], docs[1:]

    print(f'Dumping FAISS to {out_dir}')
    if model == 'openai':
        key = os.getenv('OPENAI_API_KEY')
        db = FAISS.from_documents(seed, OpenAIEmbeddings(openai_api_key=key))
    elif model == 'mpnet':
        db = FAISS.from_documents(seed, HuggingFaceEmbeddings())
    elif model == 'cohere':
        db = FAISS.from_documents(seed, CohereEmbeddings(cohere_api_key=CoherenceKey))
    else:
        raise ValueError(f'Embedding Model {model} not supported')

    # `index` equals the number of successfully added documents when a
    # failure occurs (enumerate replaces the old manual counter).
    for index, doc in enumerate(tqdm(remaining, desc="Embedding 🦖", unit="docs",
                                     total=len(remaining),
                                     bar_format='{l_bar}{bar}| Time Left: {remaining}')):
        try:
            store_add_texts_with_retry(db, doc)
        except Exception as e:
            print(e)
            print("Error on ", doc)
            print("Saving progress")
            print(f"stopped at {index} out of {len(remaining)}")
            break
    # Single save covers both the success and the early-break path
    # (previously the failure path saved the same state twice).
    db.save_local(out_dir)