That1BrainCell committed on
Commit
4663454
·
verified ·
1 Parent(s): b4c5a56

Delete embedding.py

Browse files
Files changed (1) hide show
  1. embedding.py +0 -370
embedding.py DELETED
@@ -1,370 +0,0 @@
1
- from PyPDF2 import PdfReader
2
- import requests
3
- import json
4
- import os
5
- import concurrent.futures
6
- import random
7
- from langchain_google_genai import ChatGoogleGenerativeAI
8
- from langchain_community.document_loaders import WebBaseLoader
9
- from langchain_community.document_loaders import PyPDFLoader
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- import google.generativeai as genai
12
- from langchain_core.messages import HumanMessage
13
- from io import BytesIO
14
- import numpy as np
15
- import re
16
- import torch
17
- from transformers import AutoTokenizer, AutoModel
18
-
19
- from search import search_images
20
-
21
# SECURITY: the Google API keys used to be hard-coded in this file. Any key
# that was ever committed must be treated as leaked and rotated. Keys are now
# read from the environment instead.
def _google_api_key(slot):
    """Return the Google API key configured for client *slot*.

    Looks up ``GOOGLE_API_KEY_<slot>`` first and falls back to the shared
    ``GOOGLE_API_KEY`` variable, so a single key is enough to run everything.

    Raises:
        RuntimeError: if neither environment variable is set.
    """
    key = os.environ.get(f"GOOGLE_API_KEY_{slot}") or os.environ.get("GOOGLE_API_KEY")
    if not key:
        raise RuntimeError(
            f"Set GOOGLE_API_KEY_{slot} or GOOGLE_API_KEY before importing this module"
        )
    return key


# One key per client slot; the text and vision clients of a slot share a key.
_API_KEYS = [_google_api_key(slot) for slot in range(4)]

# Text clients; the prompt helpers pick one of them at random per request.
gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_API_KEYS[0], temperature=0.1)
gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_API_KEYS[1], temperature=0.1)
gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_API_KEYS[2], temperature=0.1)
gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_API_KEYS[3], temperature=0.1)

# Clients for the gemini-1.5-flash model, one per key slot.
vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_API_KEYS[0], temperature=0.1)
vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_API_KEYS[1], temperature=0.1)
vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_API_KEYS[2], temperature=0.1)
vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_API_KEYS[3], temperature=0.1)

# Local embedding model used by get_embed_chroma.
# NOTE(review): trust_remote_code=True runs code shipped with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
model.to('cpu')  # Ensure the model is on the CPU

# The google.generativeai module-level API uses the same key as slot 1.
genai.configure(api_key=_API_KEYS[1])
37
-
38
def pdf_extractor(link):
    """Download the PDF at *link* and return the text of all its pages.

    The PDF is parsed from memory; nothing is written to disk. Failures are
    reported with ``print`` and never raised: the function returns whatever
    text was extracted before the error (possibly an empty string).

    Args:
        link: URL of the PDF document.

    Returns:
        The concatenated page text as a single string.
    """
    page_texts = []

    try:
        # A timeout is required: without one, requests waits indefinitely on
        # a stalled server.
        response = requests.get(link, timeout=30)
        response.raise_for_status()  # Raise an error for bad status codes

        # Use BytesIO to handle the PDF content in memory
        reader = PdfReader(BytesIO(response.content))
        for page in reader.pages:
            # Guard against a page yielding no text object at all.
            page_texts.append(page.extract_text() or '')

    except requests.exceptions.HTTPError as e:
        print(f'HTTP error occurred: {e}')
    except Exception as e:
        print(f'An error occurred: {e}')

    return ''.join(page_texts)
60
-
61
def web_extractor(link):
    """Load the web page at *link* and return its text content.

    The page is loaded and split with ``WebBaseLoader``; the ``page_content``
    of every resulting document is concatenated. Failures are reported with
    ``print`` (matching ``pdf_extractor``) and never raised; an empty string
    is returned when nothing could be loaded.

    Args:
        link: URL of the page to load.

    Returns:
        The concatenated page content as a single string.
    """
    page_texts = []

    try:
        pages = WebBaseLoader(link).load_and_split()
        page_texts.extend(page.page_content for page in pages)
    except Exception as e:
        # Still best-effort, but no longer silent, and no longer swallowing
        # KeyboardInterrupt / SystemExit as the bare ``except`` did.
        print(f'An error occurred: {e}')

    return ''.join(page_texts)
74
-
75
def imporve_text(text):
    """Ask a Gemini text client to rewrite *text* more concisely.

    The function name is misspelled ("imporve"); it is kept as-is because
    callers import it under this name.

    Args:
        text: The text to be rewritten.

    Returns:
        The ``content`` of the model's reply.
    """
    prompt = f'''
Please rewrite the following text to make it short, concise, and of high quality.
Ensure that all essential information and key points are retained.
Focus on improving clarity, coherence, and word choice without altering the original meaning.

text = {text}
'''

    # One of the four text clients is chosen at random for each request.
    llm = random.choice([gemini, gemini1, gemini2, gemini3])
    return llm.invoke(prompt).content
89
-
90
def feature_extraction(tag, history , context):
    """Ask a Gemini text client to merge new details into a tag's history.

    Args:
        tag: Name of the field being updated; interpolated into the prompt.
        history: Previously gathered text for that field (``Tag_History``).
        context: New material to mine for details (``Tag_Context``); it is
            converted with ``str`` before being placed in the prompt.

    Returns:
        The ``content`` of the model's reply, i.e. the updated history text.
    """
    prompt = f'''
You are an intelligent assistant tasked with updating product information. You have two data sources:
1. Tag_History: Previously gathered information about the product.
2. Tag_Context: New data that might contain additional details.
Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
Guidelines:
- Only add new details that are relevant to the {tag} FIELD.
- Do not add or modify any other fields in the Tag_History.
- Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
Here is the data:
Tag_Context: {str(context)}
Tag_History: {history}
Respond with the updated Tag_History.
'''

    # One of the four text clients is chosen at random for each request.
    llm = random.choice([gemini, gemini1, gemini2, gemini3])
    reply = llm.invoke(prompt)
    return reply.content
111
-
112
def feature_extraction_image(url):
    """Ask gemini-1.5-flash-001 to describe the image referenced by *url*.

    Best-effort: on any failure the error is printed and a single space is
    returned, so callers always receive a string.

    Args:
        url: Value passed to the model alongside the instruction text.
            NOTE(review): the URL is sent as a plain string; whether the
            model actually fetches the image is not visible here - confirm.

    Returns:
        The model's textual description, or ``' '`` if the request failed.
    """
    text = ' '
    describer = genai.GenerativeModel('gemini-1.5-flash-001')
    try:
        res = describer.generate_content(['Describe this image to me', url])
        text = res.text
    except Exception as e:
        # Keep the fallback value, but report the failure instead of hiding it.
        print(f'An error occurred: {e}')
    return text
122
-
123
def detailed_feature_extraction(find, context):
    """Ask a Gemini text client to fill in a format template from context.

    Args:
        find: The template ("Format") whose fields should be filled; it is
            converted with ``str`` before being placed in the prompt.
        context: The gathered product information; also converted with
            ``str``.

    Returns:
        The ``content`` of the model's reply.
    """
    prompt = f'''
You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
1. Context: The gathered information about the product.
2. Format: Details which need to be filled based on Context.
Your job is to read the Context and update the relevant field in Format using Context.
Guidelines:
- Only add details that are relevant to the individual FIELD.
- Do not add or modify any other fields in the Format.
- If nothing found return None.
Here is the data:
The Context is {str(context)}
The Format is {str(find)}
'''

    # One of the four text clients is chosen at random for each request.
    llm = random.choice([gemini, gemini1, gemini2, gemini3])
    answer = llm.invoke(prompt)
    return answer.content
143
-
144
def detailed_history(history):
    """Expand each section of *history* into a detailed, per-field answer.

    A template of sections is built, each mapping its field names to
    ``None``. For every ``(section, text)`` pair in *history*, the section's
    template and the text are sent to ``detailed_feature_extraction`` and the
    stringified reply replaces the template for that section.

    Args:
        history: Mapping of section name to gathered text. Every key must be
            one of the template's section names; any other key raises
            ``KeyError``.

    Returns:
        The template dict. Sections present in *history* hold the model's
        reply as a string; the others keep their ``None``-valued field dicts.
    """
    # Fields specific to each section, in order; every section additionally
    # ends with an "Additional Details" field.
    section_fields = {
        "Introduction": (
            "Product Name",
            "Overview of the product",
            "Purpose of the manual",
            "Audience",
        ),
        "Specifications": (
            "Technical specifications",
            "Performance metrics",
        ),
        "Product Overview": (
            "Product features",
            "Key components and parts",
        ),
        "Safety Information": (
            "Safety warnings and precautions",
            "Compliance and certification information",
        ),
        "Installation Instructions": (
            "Unboxing and inventory checklist",
            "Step-by-step installation guide",
            "Required tools and materials",
        ),
        "Setup and Configuration": (
            "Initial setup procedures",
            "Configuration settings",
            "Troubleshooting setup issues",
        ),
        "Operation Instructions": (
            "How to use the product",
            "Detailed instructions for different functionalities",
            "User interface guide",
        ),
        "Maintenance and Care": (
            "Cleaning instructions",
            "Maintenance schedule",
            "Replacement parts and accessories",
        ),
        "Troubleshooting": (
            "Common issues and solutions",
            "Error messages and their meanings",
            "Support Information",
        ),
        "Warranty Information": (
            "Terms and Conditions",
            "Service and repair information",
        ),
        "Legal Information": (
            "Copyright information",
            "Trademarks and patents",
            "Disclaimers",
        ),
    }

    details = {
        section: dict.fromkeys((*fields, "Additional Details"))
        for section, fields in section_fields.items()
    }

    for section, gathered in history.items():
        template = details[section]
        details[section] = str(detailed_feature_extraction(template, gathered))

    return details
220
-
221
-
222
def get_embeddings(link,tag_option):
    """Summarise the document at *link* per tag and embed each summary.

    The document text is extracted (web loader for links ending in ``.md`` or
    whose characters 8-10 are ``en.``, PDF extractor otherwise), folded into
    a per-tag ``history`` via ``feature_extraction``, and every tag's text is
    embedded with ``models/embedding-001``.

    Args:
        link: URL of the document.
        tag_option: ``'Complete Document Similarity'`` keeps one ``"Details"``
            tag fed with the first 50000 characters of the document; any
            other value uses the eleven manual-section tags and processes the
            document chunk by chunk.

    Returns:
        A ``(history, genai_embeddings)`` tuple: the tag-to-text dict and a
        list with one embedding per tag, in the dict's key order.
    """
    print(f"\n--> Creating Embeddings - {link}")

    whole_document = tag_option == 'Complete Document Similarity'

    if whole_document:
        history = {"Details": ""}
    else:
        history = {
            "Introduction": "",
            "Specifications": "",
            "Product Overview": "",
            "Safety Information": "",
            "Installation Instructions": "",
            "Setup and Configuration": "",
            "Operation Instructions": "",
            "Maintenance and Care": "",
            "Troubleshooting": "",
            "Warranty Information": "",
            "Legal Information": "",
        }

    # Extract Text -----------------------------
    print("Extracting Text")
    if link.endswith('.md') or link[8:11] == 'en.':
        text = web_extractor(link)
    else:
        text = pdf_extractor(link)

    # Create Chunks ----------------------------
    print("Writing Tag Data")

    if whole_document:
        # Both extractors return a plain string, so slice it directly; the
        # previous ``text[0][:50000]`` sent only the first character.
        history["Details"] = feature_extraction("Details", history["Details"], text[:50000])
    else:
        # create_documents expects a list of texts; passing the bare string
        # made it treat every character as its own document.
        chunks = text_splitter.create_documents([text])

        # One pool serves all chunks. Chunks are still handled one after the
        # other, because each request needs the history left by the previous
        # chunk.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for chunk in chunks:
                future_to_key = {
                    executor.submit(
                        feature_extraction, f"Product {key}", history[key], chunk.page_content
                    ): key for key in history
                }
                for future in concurrent.futures.as_completed(future_to_key):
                    key = future_to_key[future]
                    try:
                        history[key] = future.result()
                    except Exception as e:
                        # A failed tag keeps its previous text.
                        print(f"Error processing {key}: {e}")

    print("Creating Vectors")
    genai_embeddings = []

    for tag in history:
        result = genai.embed_content(
            model="models/embedding-001",
            content=history[tag],
            task_type="retrieval_document")
        genai_embeddings.append(result['embedding'])

    return history, genai_embeddings
288
-
289
def get_embed_chroma(link):
    """Chunk the document at *link* and embed every chunk with the local model.

    The text is extracted (web loader for links ending in ``.md`` or whose
    characters 8-10 are ``en.``, PDF extractor otherwise), normalised with
    three regex substitutions, split with ``text_splitter_small`` and run
    through the module-level ``tokenizer`` / ``model`` on the CPU.

    Args:
        link: URL of the document.

    Returns:
        A ``(textual_data, embedding_vectors)`` tuple of parallel lists: the
        chunk texts and, for each, its embedding as a plain Python list.
    """
    print(f"\n--> Creating Embeddings - {link}")

    # Extract Text -----------------------------
    is_web_page = link[-3:] == '.md' or link[8:11] == 'en.'
    raw_text = web_extractor(link) if is_web_page else pdf_extractor(link)
    print("\u2713 Extracting Text")

    # Create Chunks ----------------------------
    cleaned = re.sub(r'\.{2,}', '.', raw_text)
    cleaned = re.sub(r'\s{2,}', ' ', cleaned)
    cleaned = re.sub(r'\n{2,}', '\n', cleaned)

    chunks = text_splitter_small.create_documents([cleaned])
    print("\u2713 Writing Tag Data")

    # Creating Vector
    embedding_vectors = []
    textual_data = []
    print("\u2713 Creating Vectors")

    for chunk in chunks:
        content = chunk.page_content

        encoded = tokenizer(content, return_tensors="pt", padding=True, truncation=True)
        encoded = {name: tensor.to('cpu') for name, tensor in encoded.items()}

        # Inference only, so gradients are not tracked.
        with torch.no_grad():
            outputs = model(**encoded)

        # Average last_hidden_state over dim 1 (presumably the token axis).
        pooled = outputs.last_hidden_state.mean(dim=1)
        embedding_vectors.append(pooled.squeeze().cpu().numpy().tolist())
        textual_data.append(content)

    return textual_data , embedding_vectors
329
-
330
-
331
-
332
def get_image_embeddings(Product):
    """Embed textual descriptions of images found for *Product*.

    Image links come from ``search_images``; each link is described
    concurrently by ``feature_extraction_image`` and every description is
    embedded with ``models/embedding-001``.

    Args:
        Product: Search term handed to ``search_images``.

    Returns:
        A list with one embedding per image link, in the order the links
        were returned.
    """
    links = search_images(Product)

    # executor.map keeps the input order, so descriptions line up with links.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        descriptions = list(pool.map(feature_extraction_image, links))

    return [
        genai.embed_content(
            model="models/embedding-001",
            content=description,
            task_type="retrieval_document",
        )['embedding']
        for description in descriptions
    ]
348
-
349
-
350
-
351
# Module-level text splitters. The functions above look these names up when
# they are called, so defining them after the functions is fine. The former
# module-level ``global`` statements had no effect and are omitted.

# Large chunks (10000 characters) for the tag-history pipeline.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=100,
    separators=["", "", " "],
)

# Small chunks (2000 characters) for the per-chunk embedding pipeline.
text_splitter_small = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    separators=["", "", " "],
)

if __name__ == '__main__':
    # Manual smoke run against a public PDF manual.
    manual_url = 'https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'
    print(get_embed_chroma(manual_url))
    # print(get_image_embeddings(Product='Samsung Galaxy S24'))