added embeddings for new books
Browse files
- Makefile +5 -1
- data/ai_books/index.faiss +2 -2
- data/ai_books/index.pkl +2 -2
- ingest.py +7 -6
Makefile
CHANGED
|
@@ -50,7 +50,11 @@ format:
|
|
| 50 |
black .
|
| 51 |
|
| 52 |
install:
|
| 53 |
-
pip install -
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
pip show langchain transformers
|
| 55 |
|
| 56 |
install-extra:
|
|
|
|
| 50 |
black .
|
| 51 |
|
| 52 |
install:
|
| 53 |
+
pip install -r requirements.txt
|
| 54 |
+
pip show langchain transformers
|
| 55 |
+
|
| 56 |
+
install:
|
| 57 |
+
pip install -r requirements-mac.txt
|
| 58 |
pip show langchain transformers
|
| 59 |
|
| 60 |
install-extra:
|
data/ai_books/index.faiss
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57cf906d0a49d48c53ef8bfe9c107d035d2f0a15bd4e57a2d8f5560960db239f
|
| 3 |
+
size 110456877
|
data/ai_books/index.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5091df974d4a7c0c832619b0acaae195fa69ab37f7cd18873459c11c3a537494
|
| 3 |
+
size 37484917
|
ingest.py
CHANGED
|
@@ -3,8 +3,8 @@ import os
|
|
| 3 |
from timeit import default_timer as timer
|
| 4 |
from typing import List
|
| 5 |
|
| 6 |
-
from
|
| 7 |
-
from
|
| 8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 9 |
from langchain.vectorstores.base import VectorStore
|
| 10 |
from langchain.vectorstores.chroma import Chroma
|
|
@@ -81,7 +81,7 @@ if not os.path.isdir(index_path):
|
|
| 81 |
)
|
| 82 |
os.mkdir(index_path)
|
| 83 |
|
| 84 |
-
if source_urls
|
| 85 |
# Open the file for reading
|
| 86 |
file = open(source_urls, "r")
|
| 87 |
|
|
@@ -93,10 +93,11 @@ if not os.path.isdir(index_path):
|
|
| 93 |
|
| 94 |
# Remove the newline characters from each string
|
| 95 |
source_urls = [line.strip() for line in lines]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
print(
|
| 98 |
-
f"Loading {'' if source_urls is None else str(len(source_urls)) + ' '}PDF files from {source_pdfs_path}"
|
| 99 |
-
)
|
| 100 |
sources = load_documents(source_pdfs_path, source_urls)
|
| 101 |
|
| 102 |
print(f"Splitting {len(sources)} PDF pages in to chunks ...")
|
|
|
|
| 3 |
from timeit import default_timer as timer
|
| 4 |
from typing import List
|
| 5 |
|
| 6 |
+
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
| 7 |
+
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
|
| 8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 9 |
from langchain.vectorstores.base import VectorStore
|
| 10 |
from langchain.vectorstores.chroma import Chroma
|
|
|
|
| 81 |
)
|
| 82 |
os.mkdir(index_path)
|
| 83 |
|
| 84 |
+
if source_urls:
|
| 85 |
# Open the file for reading
|
| 86 |
file = open(source_urls, "r")
|
| 87 |
|
|
|
|
| 93 |
|
| 94 |
# Remove the newline characters from each string
|
| 95 |
source_urls = [line.strip() for line in lines]
|
| 96 |
+
print(f"Loading {len(source_urls)} PDF files from {source_pdfs_path}")
|
| 97 |
+
else:
|
| 98 |
+
source_urls = None
|
| 99 |
+
print(f"Loading PDF files from {source_pdfs_path}")
|
| 100 |
|
|
|
|
|
|
|
|
|
|
| 101 |
sources = load_documents(source_pdfs_path, source_urls)
|
| 102 |
|
| 103 |
print(f"Splitting {len(sources)} PDF pages in to chunks ...")
|