# transformers-chat / ingest.py
# enoreyes's picture
# Update code
# fa8c8ef
# raw
# history blame
# 1.86 kB
"""Load html from files, clean up, split, ingest into Weaviate."""
import os
from pathlib import Path
from markdown import markdown
import pickle
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from InstructorEmbedding import INSTRUCTOR
# Fail fast (KeyError, exactly as before) when the API key is not configured,
# but never echo the secret itself: anything printed here ends up in
# console/build logs.
# NOTE(review): the variable name is spelled "HUGGINFACE" (missing a "G");
# kept as-is because the deployment environment may define it that way.
if "HUGGINFACE_APIKEY" not in os.environ:
    raise KeyError("HUGGINFACE_APIKEY")
print("HUGGINFACE_APIKEY is set")
def clean_data(data: str) -> str:
    """Convert Markdown source to plain text with empty lines removed.

    The Markdown is rendered to HTML, the text nodes of the parsed HTML are
    concatenated, and every empty line is dropped from the result.

    Args:
        data: Raw Markdown text.

    Returns:
        The extracted text with its non-empty lines joined by newlines.
    """
    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); both select the text nodes of the document.
    text = "".join(soup.find_all(string=True))
    # Only truly empty lines are removed; whitespace-only lines are kept.
    return "\n".join(line for line in text.split("\n") if line)
# Collect the cleaned text of every Markdown file under docs/ together with a
# per-file metadata record. Both lists are appended to once per file, so
# docs[i] and metadatas[i] always describe the same file.
docs = []
metadatas = []
for p in Path("docs").rglob("*"):
    if p.is_dir():
        continue
    if not str(p).lower().endswith(('.md', '.mdx')):
        continue
    # Decode as UTF-8 explicitly instead of relying on the platform default
    # encoding, which differs between systems (e.g. cp1252 on Windows) and
    # would garble or reject non-ASCII text.
    # NOTE(review): assumes the documentation files are UTF-8 -- confirm.
    with open(p, encoding="utf-8") as f:
        docs.append(clean_data(f.read()))
    # Source id: path without its extension, forward slashes only, with the
    # first 5 characters (the "docs/" prefix) cut off.
    filename = os.path.splitext(p)[0]
    newfile_name = filename.replace("\\", "/")[5:]
    print("file:" + newfile_name)
    metadatas.append({"source": newfile_name})
# Chunking configuration: newline separator, a chunk size of 768 and an
# overlap of 128, both measured with len().
text_splitter = CharacterTextSplitter(
    chunk_size=768, chunk_overlap=128, separator="\n", length_function=len
)
# Split every collected text; metadatas is passed alongside docs so the
# resulting documents carry the per-file "source" record.
documents = text_splitter.create_documents(docs, metadatas=metadatas)
print("making embedding")
# Embedding model name plus the instruction strings passed to it for
# documents (embed) and for queries.
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the text from the Hugging Face code documentation"
query_instruction = "Query the most relevant text from the Hugging Face code documentation"
embedding = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    embed_instruction=embed_instruction,
    query_instruction=query_instruction,
)

print("beginning construction of faiss")
# Build the FAISS store from the chunked documents using the embedding above.
search_index = FAISS.from_documents(documents=documents, embedding=embedding)
print("beginning pickle")
# Serialize the whole index object into docs.pkl (binary mode, overwriting
# any existing file of that name).
with Path("docs.pkl").open("wb") as index_file:
    pickle.dump(search_index, index_file)
print("Pickle complete")