ravi259 committed
Commit 4341309 · 1 Parent(s): 2cff746

smaller faiss db

Files changed (3):
  1. app.py +4 -4
  2. requirements-backup.txt +95 -0
  3. vector_loader.py +37 -234
app.py CHANGED
@@ -35,7 +35,7 @@ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 
 def load_knowledgeBase():
     embeddings=OpenAIEmbeddings(api_key=OPENAI_API_KEY)
-    DB_FAISS_PATH = "vectorstore/db_faiss/"
+    DB_FAISS_PATH = "./vectorstore/db_faiss/"
     db = FAISS.load_local(
         DB_FAISS_PATH,
         embeddings,
@@ -45,10 +45,10 @@ def load_knowledgeBase():
     return db
 def load_prompt():
     prompt = """ You are helping students to pass NJMVC Knowledge Test. Provide a Single multiple choice question with 4 options to choose from.
-    Use the information from context ONLY to provide the question and answer choices.
+    Use the information from context provided below to provide the question and answer choices.
     context = {context}
     question = {question}
-    if the answer is not in the pdf answer "i donot know what the hell you are asking about"
+    if the context is not available, say I cannot give Question"
     """
     prompt = ChatPromptTemplate.from_template(prompt)
     return prompt
@@ -128,7 +128,7 @@ def main():
     text_chunks = get_chunk_text(question)
 
     db = FAISS.load_local(folder_path="./vectorstore/db_faiss/",embeddings=OpenAIEmbeddings(api_key=OPENAI_API_KEY),allow_dangerous_deserialization=True, index_name="njmvc_Index")
-    searchDocs = db.similarity_search("what is the NJMVC driving test")
+    searchDocs = db.similarity_search(question)
 
     similar_embeddings=FAISS.from_documents(documents=searchDocs, embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY))
     #creating the chain for integrating llm,prompt,stroutputparser
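
Note: the chain construction the trailing comment refers to is outside these hunks. The following is only a sketch of how load_knowledgeBase(), the prompt, and a StrOutputParser are typically composed in LangChain; the ChatOpenAI model, the inline prompt, and the invoke call are assumptions for illustration, not part of this commit.

    # Hypothetical wiring, for illustration only -- this commit does not show the chain.
    import os
    from langchain_community.vectorstores import FAISS
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.runnables import RunnablePassthrough

    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

    # Load the FAISS store the same way app.py does after this commit.
    db = FAISS.load_local(
        folder_path="./vectorstore/db_faiss/",
        embeddings=OpenAIEmbeddings(api_key=OPENAI_API_KEY),
        allow_dangerous_deserialization=True,
        index_name="njmvc_Index",
    )
    retriever = db.as_retriever()

    # Stand-in for load_prompt() so the sketch is self-contained.
    prompt = ChatPromptTemplate.from_template(
        "Use the context to write one multiple-choice question.\n"
        "context = {context}\nquestion = {question}"
    )

    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | ChatOpenAI(api_key=OPENAI_API_KEY)
        | StrOutputParser()
    )
    # print(chain.invoke("right of way at intersections"))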
requirements-backup.txt ADDED
@@ -0,0 +1,95 @@
+aiohttp==3.9.3
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.3.0
+async-timeout==4.0.3
+attrs==23.2.0
+blinker==1.7.0
+cachetools==5.3.3
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+cryptography==42.0.5
+dataclasses-json==0.6.4
+datasets==2.18.0
+dill==0.3.8
+distro==1.9.0
+exceptiongroup==1.2.0
+faiss-cpu==1.8.0
+filelock==3.13.1
+frozenlist==1.4.1
+fsspec==2024.2.0
+gitdb==4.0.11
+GitPython==3.1.42
+greenlet==3.0.3
+h11==0.14.0
+httpcore==1.0.4
+httpx==0.27.0
+huggingface-hub==0.21.4
+idna==3.6
+Jinja2==3.1.3
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+langchain==0.1.13
+langchain-community==0.0.29
+langchain-core==0.1.33
+langchain-openai==0.1.1
+langchain-text-splitters==0.0.1
+langsmith==0.1.31
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.1
+mdurl==0.1.2
+multidict==6.0.5
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+numpy==1.26.4
+openai==1.14.2
+orjson==3.9.15
+packaging==23.2
+pandas==2.2.1
+pdf2image==1.17.0
+pdfminer.six==20231228
+pdfplumber==0.11.0
+pillow==10.2.0
+protobuf==4.25.3
+pyarrow==15.0.2
+pyarrow-hotfix==0.6
+pycparser==2.21
+pydantic==2.6.4
+pydantic_core==2.16.3
+pydeck==0.8.1b0
+Pygments==2.17.2
+PyPDF2==3.0.1
+pypdfium2==4.28.0
+pytesseract==0.3.10
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.34.0
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.0
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+SQLAlchemy==2.0.28
+streamlit==1.32.2
+tenacity==8.2.3
+tiktoken==0.6.0
+toml==0.10.2
+toolz==0.12.1
+tornado==6.4
+tqdm==4.66.2
+typing-inspect==0.9.0
+typing_extensions==4.10.0
+tzdata==2024.1
+urllib3==2.2.1
+xxhash==3.4.1
+yarl==1.9.4
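
Since this file is a pinned backup of the environment, it can double as a drift check. A minimal sketch using only the standard library (the filename is the one added above; package-name normalization is assumed to be handled by importlib.metadata):

    # Sketch: report drift between the running environment and requirements-backup.txt.
    from importlib.metadata import version, PackageNotFoundError

    with open("requirements-backup.txt") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            name, _, pinned = line.partition("==")
            try:
                installed = version(name)
                status = "ok" if installed == pinned else f"mismatch (installed {installed})"
            except PackageNotFoundError:
                status = "missing"
            print(f"{name}=={pinned}: {status}")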
vector_loader.py CHANGED
@@ -1,7 +1,10 @@
 from langchain_community.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
+from langchain_community.document_loaders import TextLoader
 from langchain_openai import OpenAIEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
+
 import PyPDF2
 from PyPDF2 import PdfReader
 import pdfplumber
@@ -18,248 +21,48 @@ from dotenv import load_dotenv
 load_dotenv()
 OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 
-# Extracting tables from the page
-def extract_table(pdf_path, page_num, table_num):
-    # Open the pdf file
-    pdf = pdfplumber.open(pdf_path)
-    # Find the examined page
-    table_page = pdf.pages[page_num]
-    # Extract the appropriate table
-    table = table_page.extract_tables()[table_num]
-
-    return table
-
-# Convert table into appropriate format
-def table_converter(table):
-    table_string = ''
-    # Iterate through each row of the table
-    for row_num in range(len(table)):
-        row = table[row_num]
-        # Remove the line breaks from the wrapped texts
-        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
-        # Convert the table into a string
-        table_string += ('|'+'|'.join(cleaned_row)+'|'+'\n')
-    # Removing the last line break
-    table_string = table_string[:-1]
-    return table_string
-
-
-# Check if the element is in any table present in the page
-def is_element_inside_any_table(element, page, tables):
-    x0, y0up, x1, y1up = element.bbox
-    # Change the coordinates because pdfminer counts from the bottom to the top of the page
-    y0 = page.bbox[3] - y1up
-    y1 = page.bbox[3] - y0up
-    for table in tables:
-        tx0, ty0, tx1, ty1 = table.bbox
-        if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
-            return True
-    return False
-
-# Find the table for a given element
-def find_table_for_element(element, page, tables):
-    x0, y0up, x1, y1up = element.bbox
-    # Change the coordinates because pdfminer counts from the bottom to the top of the page
-    y0 = page.bbox[3] - y1up
-    y1 = page.bbox[3] - y0up
-    for i, table in enumerate(tables):
-        tx0, ty0, tx1, ty1 = table.bbox
-        if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
-            return i  # Return the index of the table
-    return None
-
-
-def text_extraction(element):
-    # Extracting the text from the inline text element
-    line_text = element.get_text()
-
-    # Find the formats of the text
-    # Initialize the list with all the formats that appear in the line of text
-    line_formats = []
-    for text_line in element:
-        if isinstance(text_line, LTTextContainer):
-            # Iterating through each character in the line of text
-            for character in text_line:
-                if isinstance(character, LTChar):
-                    # Append the font name of the character
-                    #line_formats.append(character.fontname)
-                    # Append the font size of the character
-                    #line_formats.append(character.size)
-                    line_formats.append("")
-
-    # Find the unique font sizes and names in the line
-    format_per_line = list(set(line_formats))
-
-    # Return a tuple with the text in each line along with its format
-    return (line_text, format_per_line)
-
-
-# Crop the image elements from PDFs
-def crop_image(element, pageObj):
-    # Get the coordinates to crop the image from the PDF
-    [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
-    # Crop the page using coordinates (left, bottom, right, top)
-    pageObj.mediabox.lower_left = (image_left, image_bottom)
-    pageObj.mediabox.upper_right = (image_right, image_top)
-    # Save the cropped page to a new PDF
-    cropped_pdf_writer = PyPDF2.PdfWriter()
-    cropped_pdf_writer.add_page(pageObj)
-    # Save the cropped PDF to a new file
-    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
-        cropped_pdf_writer.write(cropped_pdf_file)
-
-# Convert the PDF to images
-def convert_to_images(input_file):
-    images = convert_from_path(input_file)
-    image = images[0]
-    output_file = 'PDF_image.png'
-    image.save(output_file, 'PNG')
-
-# Read text from images
-def image_to_text(image_path):
-    # Read the image
-    img = Image.open(image_path)
-    # Extract the text from the image
-    text = pytesseract.image_to_string(img)
-    return text
-
-
-def read_file_get_prompts(file_name):
-    if file_name is not None:
-        # Find the PDF path
-        pdf_path = file_name  # '/content/data/'+file_name+".pdf"
-        pdfReaded = PyPDF2.PdfReader(file_name)
-
-        # Create the dictionary to extract text from each image
-        text_per_page = {}
-        # Create a boolean variable for image detection
-        image_flag = False
-
-        number_of_pages = len(list(extract_pages(file_name)))
-        result = ''
-
-        # We extract the pages from the PDF
-        for pagenum, page in enumerate(extract_pages(file_name)):
-            # Initialize the variables needed for the text extraction from the page
-            pageObj = pdfReaded.pages[pagenum]
-            page_text = []
-            line_format = []
-            text_from_images = []
-            text_from_tables = []
-            page_content = []
-            # Initialize the number of the examined tables
-            table_in_page = -1
-            # Open the pdf file
-            pdf = pdfplumber.open(pdf_path)
-            # Find the examined page
-            page_tables = pdf.pages[pagenum]
-            # Find the number of tables in the page
-            tables = page_tables.find_tables()
-            if len(tables) != 0:
-                table_in_page = 0
-
-            # Extracting the tables of the page
-            for table_num in range(len(tables)):
-                # Extract the information of the table
-                table = extract_table(pdf_path, pagenum, table_num)
-                # Convert the table information into structured string format
-                table_string = table_converter(table)
-                # Append the table string into a list
-                text_from_tables.append(table_string)
-
-            # Find all the elements
-            page_elements = [(element.y1, element) for element in page._objs]
-            # Sort all the elements as they appear in the page
-            page_elements.sort(key=lambda a: a[0], reverse=True)
-
-            # Find the elements that compose a page
-            for i, component in enumerate(page_elements):
-                # Extract the element of the page layout
-                element = component[1]
-
-                # Check the elements for tables
-                if table_in_page == -1:
-                    pass
-                else:
-                    if is_element_inside_any_table(element, page, tables):
-                        table_found = find_table_for_element(element, page, tables)
-                        if table_found == table_in_page and table_found != None:
-                            page_content.append(text_from_tables[table_in_page])
-                            #page_text.append('table')
-                            #line_format.append('table')
-                            table_in_page += 1
-                        # Pass this iteration because the content of this element was extracted from the tables
-                        continue
-
-                if not is_element_inside_any_table(element, page, tables):
-                    # Check if the element is a text element
-                    if isinstance(element, LTTextContainer):
-                        # Use the function to extract the text and format for each text element
-                        (line_text, format_per_line) = text_extraction(element)
-                        # Append the text of each line to the page text
-                        page_text.append(line_text)
-                        # Append the format for each line containing text
-                        line_format.append(format_per_line)
-                        page_content.append(line_text)
-
-                    # Check the elements for images
-                    if isinstance(element, LTFigure):
-                        # Crop the image from the PDF
-                        crop_image(element, pageObj)
-                        # Convert the cropped pdf to an image
-                        convert_to_images('cropped_image.pdf')
-                        # Extract the text from the image
-                        image_text = image_to_text('PDF_image.png')
-                        image_text = ""  # removed to avoid the errors with images
-                        text_from_images.append(image_text)
-                        page_content.append(image_text)
-                        # Add a placeholder in the text and format lists
-                        #page_text.append('image')
-                        #line_format.append('image')
-                        # Update the flag for image detection
-                        image_flag = True
-
-            # Create the key of the dictionary
-            dctkey = 'Page_'+str(pagenum)
-            print(dctkey)
-
-            # Add the list of lists as the value of the page key
-            #text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]
-            text_per_page[dctkey] = [page_text, text_from_images, text_from_tables, page_content]
-            #result = result.join(page_text).join(line_format).join(text_from_images).join(text_from_tables).join(page_content)
-        result = " "
-        for t in range(number_of_pages):
-            page = 'Page_'+str(t)
-            #result = result.join(map(str, text_per_page[page]))
-            for q in range(len(text_per_page[page])):
-                #print(f"{''.join(map(str, text_per_page[page][q]))}")
-                result = result + f"{''.join(map(str, text_per_page[page][q]))}"
-
-        return result
-
-    return True
+def extract_text_from_pdf(pdf_path):
+    # Open the PDF file
+    with open(pdf_path, 'rb') as pdf_file:
+        # Read the PDF file
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        # Get the number of pages in the PDF
+        num_pages = len(pdf_reader.pages)
+        # Initialize an empty string to store the text
+        full_text = ''
+        # Loop through each page and extract the text
+        for page_num in range(num_pages):
+            # Extract the text from the page
+            page_text = pdf_reader.pages[page_num].extract_text()
+            # Append the text to the full_text variable
+            full_text += page_text
+    # Return the full text of the PDF
+    return full_text
+
+model = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+embeddings = HuggingFaceEmbeddings(model_name=model)
 
 def save_to_vector_store(text):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    docs = text_splitter.create_documents(text)
-    vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY))
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300,
+                                                   chunk_overlap=20,
+                                                   length_function=len,
+                                                   is_separator_regex=False)
+    docs = text_splitter.create_documents([text])
+    vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings(model="text-embedding-ada-002", api_key=OPENAI_API_KEY))
+    #vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)
     vectorstore.save_local(DB_FAISS_PATH, index_name="njmvc_Index")
 #create a new file named vectorstore in your current directory.
 if __name__=="__main__":
-    DB_FAISS_PATH = 'vectorstore/db_faiss'
+    DB_FAISS_PATH = './vectorstore/db_faiss/'
     file_name = "./data/drivermanual-2-small.pdf"
     #loader=read_file_get_prompts(file_name)
-    text=read_file_get_prompts(file_name)
+    #text=read_file_get_prompts(file_name)
+    text = extract_text_from_pdf(file_name)
     #pdfReaded = PyPDF2.PdfReader(file_name)
     #docs=loader.load()
-    save_to_vector_store(text)
     #save_to_vector_store(text)
+    save_to_vector_store(text)
 
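
Net effect of the commit: the pdfminer/pdfplumber/OCR extraction pipeline is dropped in favor of plain PyPDF2 text extraction, and chunks shrink from 1000 characters with 200 overlap to 300 with 20, rebuilt from drivermanual-2-small.pdf. A quick sanity check on the rebuilt index might look like the sketch below; the test query and k are arbitrary, not from the commit.

    # Sketch: load the rebuilt FAISS index and spot-check retrieval.
    import os
    from langchain_community.vectorstores import FAISS
    from langchain_openai import OpenAIEmbeddings

    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    db = FAISS.load_local(
        "./vectorstore/db_faiss/",
        OpenAIEmbeddings(api_key=OPENAI_API_KEY),
        index_name="njmvc_Index",
        allow_dangerous_deserialization=True,
    )
    print("vectors stored:", db.index.ntotal)  # size of the raw faiss index
    for doc in db.similarity_search("speed limits", k=3):  # arbitrary test query
        print(doc.page_content[:100].replace("\n", " "))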