Xalt8 committed on
Commit
a206783
·
1 Parent(s): 31ec9f8

vector store for windows

Browse files
core-langchain-rag.py CHANGED
@@ -214,7 +214,7 @@ def generate_qa_retriever(history: dict, question: str, llm_model:HuggingFaceEnd
214
  template = """
215
  You are a friendly insurance product advisor, your task is to help customers find the best products from Württembergische GmbH.\
216
  You help the user find the answers to all his questions. Answer in short and simple terms and offer to explain the product and terms to the user.\
217
- Respond only using the provided context (delimited by <ctx></ctx>) and only in German or Englisch, depending on the question's language.
218
  Use the chat history (delimited by <hs></hs>) to help find the best product for the user:
219
  ------
220
  <ctx>
 
214
  template = """
215
  You are a friendly insurance product advisor, your task is to help customers find the best products from Württembergische GmbH.\
216
  You help the user find the answers to all his questions. Answer in short and simple terms and offer to explain the product and terms to the user.\
217
+ Respond only using the provided context (delimited by <ctx></ctx>) and only in German or English, depending on the question's language.
218
  Use the chat history (delimited by <hs></hs>) to help find the best product for the user:
219
  ------
220
  <ctx>
rag_app/__init__.py ADDED
File without changes
rag_app/load_vector_stores.py CHANGED
@@ -10,6 +10,7 @@ from dotenv import load_dotenv
10
  import os
11
  import sys
12
  import logging
 
13
 
14
  # Load environment variables from a .env file
15
  config = load_dotenv(".env")
@@ -38,6 +39,7 @@ def get_faiss_vs():
38
 
39
  # Define the destination for the downloaded file
40
  VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
 
41
  try:
42
  # Download the pre-prepared vectorized index from the S3 bucket
43
  print("Downloading the pre-prepared vectorized index from S3...")
@@ -51,7 +53,32 @@ def get_faiss_vs():
51
 
52
  except Exception as e:
53
  print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
54
- #faissdb = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
 
57
  ## Chroma DB
@@ -67,4 +94,10 @@ def get_chroma_vs():
67
  chromadb = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=embeddings)
68
  chromadb.get()
69
  except Exception as e:
70
- print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
 
 
 
 
 
 
 
10
  import os
11
  import sys
12
  import logging
13
+ from pathlib import Path
14
 
15
  # Load environment variables from a .env file
16
  config = load_dotenv(".env")
 
39
 
40
  # Define the destination for the downloaded file
41
  VS_DESTINATION = FAISS_INDEX_PATH + ".zip"
42
+
43
  try:
44
  # Download the pre-prepared vectorized index from the S3 bucket
45
  print("Downloading the pre-prepared vectorized index from S3...")
 
53
 
54
  except Exception as e:
55
  print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
56
+ # faissdb = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
57
+
58
+
59
def get_faiss_vs_from_s3(s3_loc: str,
                         s3_vs_name: str,
                         vs_dir: str = 'vectorstore',
                         zip_name: str = 'faiss-insurance-agent-500.zip') -> None:
    """Download the zipped FAISS vector store from an S3 bucket and unpack it.

    The archive is saved as ``../<vs_dir>/<zip_name>`` (relative to the current
    working directory) and then extracted into ``../<vs_dir>``.

    Args:
        s3_loc (str): Name of the S3 bucket
        s3_vs_name (str): Name (key) of the file to be downloaded
        vs_dir (str): The name of the directory where the file is to be saved;
            it is resolved against the parent of the current working
            directory and must already exist
        zip_name (str): File name under which the downloaded archive is
            stored inside ``vs_dir``

    Raises:
        AssertionError: If the destination directory does not exist.

    Errors during download or extraction are not raised; they are reported
    on stderr.
    """
    # Destination folder
    vs_dir_path = Path("..") / vs_dir
    # Explicit raise instead of an `assert` statement so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not vs_dir_path.is_dir():
        raise AssertionError("Cannot find vs_dir folder")
    vs_destination = vs_dir_path / zip_name

    # Initialize an S3 client with unsigned configuration for public access
    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    try:
        # Pass the file name as str: older boto3 releases reject path-like
        # objects for the Filename argument.
        s3.download_file(s3_loc, s3_vs_name, str(vs_destination))
        # Extract the downloaded zip file next to the archive
        with zipfile.ZipFile(file=vs_destination, mode='r') as zip_ref:
            zip_ref.extractall(path=vs_dir_path.as_posix())
    except Exception as e:
        # Best effort, matching the other loaders in this module: report the
        # failure and return instead of propagating it.
        print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
82
 
83
 
84
  ## Chroma DB
 
94
  chromadb = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=embeddings)
95
  chromadb.get()
96
  except Exception as e:
97
+ print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
98
+
99
+
100
+ if __name__ == "__main__":
101
+ # get_faiss_vs_from_s3(s3_loc=S3_LOCATION, s3_vs_name=FAISS_VS_NAME)
102
+ pass
103
+
rag_app/metadata.ipynb ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from pathlib import Path\n",
10
+ "from langchain_community.vectorstores import FAISS\n",
11
+ "from dotenv import load_dotenv\n",
12
+ "import os\n",
13
+ "from langchain_huggingface import HuggingFaceEmbeddings"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 3,
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "data": {
23
+ "text/plain": [
24
+ "True"
25
+ ]
26
+ },
27
+ "execution_count": 3,
28
+ "metadata": {},
29
+ "output_type": "execute_result"
30
+ }
31
+ ],
32
+ "source": [
33
+ "load_dotenv()"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 5,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')\n",
43
+ "EMBEDDING_MODEL = os.getenv(\"EMBEDDING_MODEL\")"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 7,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "folder_path = Path('..') / \"vectorstore/faiss-insurance-agent-500\"\n",
62
+ "faissdb = FAISS.load_local(folder_path=str(folder_path.resolve()),\n",
63
+ " embeddings=embeddings,\n",
64
+ " allow_dangerous_deserialization=True) "
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 24,
70
+ "metadata": {},
71
+ "outputs": [
72
+ {
73
+ "name": "stdout",
74
+ "output_type": "stream",
75
+ "text": [
76
+ "Content: Die private Haftpflichtversicherung...\n",
77
+ "Metadata: {'source': 'https://www.wuerttembergische.de/versicherungen/stadt/wuppertal/', 'content_type': 'text/html; charset=UTF-8', 'title': 'Versicherung in Wuppertal', 'description': 'Ihre Versicherungsagentur in Wuppertal: Kommen Sie zur Württembergischen Versicherung und profitieren Sie von einer persönlichen Beratung und ausgezeichnetem Service. ', 'language': 'de'}\n",
78
+ "---\n",
79
+ "Content: Haftpflichtversicherung...\n",
80
+ "Metadata: {'source': 'https://www.wuerttembergische.de/wohnen/hausratversicherung/sengschaden/', 'content_type': 'text/html; charset=UTF-8', 'title': 'Sengschäden: So schützt Sie Ihre Hausrat- und Wohngebäudeversicherung', 'description': 'Deckt Ihre Hausratversicherung Sengschäden ab? Finden Sie heraus, wie Sie bei Schäden durch Glut oder Hitze ohne direktes Feuer geschützt sind.\\n', 'language': 'de'}\n",
81
+ "---\n",
82
+ "Content: Die Leistungen unserer privaten Haftpflichtversich...\n",
83
+ "Metadata: {'source': 'https://www.wuerttembergische.de/existenz/private-haftpflichtversicherung/drohnen-versichern/', 'content_type': 'text/html; charset=UTF-8', 'title': 'Drohnen über die private Haftpflicht versichern', 'description': 'Müssen Drohnen versichert sein? Welcher Tarif ist der beste? Erfahren Sie hier die wichtigsten Informationen rund ums Thema Drohne versichern.', 'language': 'de'}\n",
84
+ "---\n",
85
+ "Content: Das kann ohne private Haftpflichtversicherung pass...\n",
86
+ "Metadata: {'source': 'https://www.wuerttembergische.de/existenz/private-haftpflichtversicherung/pflicht/', 'content_type': 'text/html; charset=UTF-8', 'title': 'Ist die private Haftpflichtversicherung Pflicht oder freiwillig?', 'description': 'Ist eine Privathaftpflichtversicherung gesetzlich vorgeschrieben? Welche Haftpflichtversicherung Pflicht sind und welche freiwillig - das erfahren Sie hier.', 'language': 'de'}\n",
87
+ "---\n",
88
+ "Content: Private Haftpflicht: keine Pflichtversicherung\n",
89
+ "Fre...\n",
90
+ "Metadata: {'source': 'https://www.wuerttembergische.de/existenz/private-haftpflichtversicherung/pflicht/', 'content_type': 'text/html; charset=UTF-8', 'title': 'Ist die private Haftpflichtversicherung Pflicht oder freiwillig?', 'description': 'Ist eine Privathaftpflichtversicherung gesetzlich vorgeschrieben? Welche Haftpflichtversicherung Pflicht sind und welche freiwillig - das erfahren Sie hier.', 'language': 'de'}\n",
91
+ "---\n"
92
+ ]
93
+ }
94
+ ],
95
+ "source": [
96
+ "# Perform a similarity search for a sample query and inspect the top 5 matching documents\n",
97
+ "documents = faissdb.similarity_search(\"Private Haftpflicht­versicherung\", k=5)\n",
98
+ "\n",
99
+ "for doc in documents:\n",
100
+ " print(f\"Content: {doc.page_content[:50]}...\") # Print first 50 chars of content\n",
101
+ " print(f\"Metadata: {doc.metadata}\")\n",
102
+ " print(\"---\")"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": 19,
108
+ "metadata": {},
109
+ "outputs": [
110
+ {
111
+ "name": "stdout",
112
+ "output_type": "stream",
113
+ "text": [
114
+ "Number of entries in the database: 62496\n"
115
+ ]
116
+ }
117
+ ],
118
+ "source": [
119
+ "num_entries = len(faissdb.index_to_docstore_id)\n",
120
+ "print(f\"Number of entries in the database: {num_entries}\")"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 20,
126
+ "metadata": {},
127
+ "outputs": [
128
+ {
129
+ "name": "stdout",
130
+ "output_type": "stream",
131
+ "text": [
132
+ "Number of entries in the database: 62496\n"
133
+ ]
134
+ }
135
+ ],
136
+ "source": [
137
+ "num_entries = faissdb.index.ntotal\n",
138
+ "print(f\"Number of entries in the database: {num_entries}\")"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": []
147
+ }
148
+ ],
149
+ "metadata": {
150
+ "kernelspec": {
151
+ "display_name": "venv",
152
+ "language": "python",
153
+ "name": "python3"
154
+ },
155
+ "language_info": {
156
+ "codemirror_mode": {
157
+ "name": "ipython",
158
+ "version": 3
159
+ },
160
+ "file_extension": ".py",
161
+ "mimetype": "text/x-python",
162
+ "name": "python",
163
+ "nbconvert_exporter": "python",
164
+ "pygments_lexer": "ipython3",
165
+ "version": "3.11.4"
166
+ }
167
+ },
168
+ "nbformat": 4,
169
+ "nbformat_minor": 2
170
+ }
rag_app/metadata_filtering.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from langchain_community.vectorstores import FAISS
3
+ from dotenv import load_dotenv
4
+ import os
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+
7
+
8
+ load_dotenv(".env")
9
+
10
+ HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
11
+ EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
12
+
13
+
14
if __name__ == "__main__":
    # Manual check: load the local FAISS index and print the metadata of the
    # first few stored documents.

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    # The index is expected one level above the current working directory.
    folder_path = Path('..') / "vectorstore/faiss-insurance-agent-500"

    print(f'{Path(folder_path).exists() = }')

    # NOTE(review): allow_dangerous_deserialization=True unpickles the stored
    # docstore; only load index folders that come from a trusted source.
    faissdb = FAISS.load_local(folder_path=str(folder_path.resolve()),
                               embeddings=embeddings,
                               allow_dangerous_deserialization=True)

    # The LangChain FAISS wrapper has no Chroma-style `.get()`; map the first
    # index positions to docstore ids and fetch the documents from the docstore.
    sample_size = min(5, len(faissdb.index_to_docstore_id))
    doc_ids = [faissdb.index_to_docstore_id[pos] for pos in range(sample_size)]

    for doc_id in doc_ids:
        # presumably every id in the mapping resolves to a Document - verify
        # if entries have been deleted from the store
        doc = faissdb.docstore.search(doc_id)
        print(f"Metadata: {doc.metadata}")