sksameermujahid commited on
Commit
0a1a1bb
·
verified ·
1 Parent(s): 0c872bc

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitignore +29 -0
  2. Dockerfile +26 -0
  3. app.py +634 -0
  4. requirements.txt +16 -0
.gitignore ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+ .Python
6
+ build/
7
+ develop-eggs/
8
+ dist/
9
+ downloads/
10
+ eggs/
11
+ .eggs/
12
+ lib/
13
+ lib64/
14
+ parts/
15
+ sdist/
16
+ var/
17
+ wheels/
18
+ *.egg-info/
19
+ .installed.cfg
20
+ *.egg
21
+ MANIFEST
22
+ .env
23
+ .venv
24
+ env/
25
+ venv/
26
+ ENV/
27
+ env.bak/
28
+ venv.bak/
29
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

WORKDIR /code

# System dependencies:
#   build-essential - toolchain for Python packages that compile native code
#   ffmpeg          - needed by pydub to decode the mp3/ogg/webm uploads that
#                     the /transcribe endpoint accepts (only WAV works without it)
#   curl, git       - kept from the original image
# software-properties-common was dropped: nothing in this build uses
# add-apt-repository. --no-install-recommends keeps the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    ffmpeg \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached across code changes
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . .

# The Flask app in app.py listens on port 7860
EXPOSE 7860

# Run the application
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE: Notebook shell magics (`!pip install ...`, `!ngrok config add-authtoken ...`)
# are not valid Python and make this module fail with a SyntaxError on import, so
# they must not appear in app.py. Python dependencies are installed from
# requirements.txt by the Dockerfile instead.
#
# SECURITY: never hard-code an ngrok auth token in source. Any token that was
# committed must be treated as compromised and revoked; if a tunnel is needed,
# read the token from the environment (e.g. os.getenv("NGROK_AUTHTOKEN")).
4
+ import os
5
+ import faiss
6
+ import torch
7
+ import pandas as pd
8
+ from sentence_transformers import SentenceTransformer
9
+ from flask import Flask, request, jsonify, render_template
10
+ from flask_cors import CORS
11
+ from pyngrok import ngrok
12
+ import requests
13
+ import cloudinary
14
+ import cloudinary.uploader
15
+ import cloudinary.api
16
+ from transformers import AutoTokenizer, AutoModelForCausalLM
17
+ import speech_recognition as sr
18
+ from pydub import AudioSegment
19
+ from happytransformer import HappyTextToText, TTSettings
20
+ import io
21
+ import logging
22
+ import geocoder
23
+ from geopy.distance import geodesic
24
+ import webrtcvad
25
+ import collections
26
+ import time
27
+ from werkzeug.utils import secure_filename
28
+ from geopy.geocoders import Nominatim
29
+ import pickle
30
+ import numpy as np
31
+
32
+ # Configure logging
33
# Root logger emits INFO and above.
logging.basicConfig(level=logging.INFO)

# Flask application; templates are resolved from the "templates" folder.
app = Flask(__name__, template_folder="templates")
CORS(app)

# Service credentials, read from the environment with placeholder fallbacks.
API_KEY = os.environ.get("API_KEY", "default_key")
CSE_ID = os.environ.get("CSE_ID", "default_cse")
CLOUDINARY_CLOUD_NAME = os.environ.get("CLOUDINARY_CLOUD_NAME", "default_cloud")
CLOUDINARY_API_KEY = os.environ.get("CLOUDINARY_API_KEY", "default_key")
CLOUDINARY_API_SECRET = os.environ.get("CLOUDINARY_API_SECRET", "default_secret")

# Relative locations of the model artefacts and the property dataset.
MODEL_PATH = os.path.join("models", "model_state_dict.pth")
FAISS_INDEX_PATH = os.path.join("models", "property_faiss.index")
DATASET_PATH = os.path.join("data", "property_data.csv")
MODEL_DIR = os.path.join("models", "llm_model")

# Prefer the GPU whenever torch can see one.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Per-session state shared by the route handlers, keyed by the client's session_id.
conversation_context = {}
58
+
59
+ # Load SentenceTransformer model
60
def load_sentence_transformer():
    """Load the Jina embedding model and apply the checkpoint stored at ``MODEL_PATH``.

    Quantized tensors in the checkpoint are dequantized and reduced-precision
    floating-point tensors (e.g. bfloat16) are widened to float32 before the
    state dict is applied. Non-floating-point tensors are left untouched.

    Returns:
        The ``SentenceTransformer`` instance, moved to ``device``.

    Raises:
        Exception: Any failure while loading is printed and re-raised.
    """
    print("Loading SentenceTransformer model...")
    try:
        model_embedding = SentenceTransformer(
            "jinaai/jina-embeddings-v3", trust_remote_code=True
        ).to(device)

        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        state_dict = torch.load(MODEL_PATH, map_location=device)

        # Every tensor has a ``dequantize`` attribute, so ``hasattr`` cannot tell
        # quantized tensors apart; ``is_quantized`` is the reliable check.
        # Iterate over a snapshot because values are replaced in the loop.
        for key, tensor in list(state_dict.items()):
            if not torch.is_tensor(tensor):
                continue
            if tensor.is_quantized:
                state_dict[key] = tensor.dequantize().to(dtype=torch.float32)
            elif tensor.is_floating_point() and tensor.dtype != torch.float32:
                state_dict[key] = tensor.to(dtype=torch.float32)

        model_embedding.load_state_dict(state_dict)
        print("SentenceTransformer model loaded successfully.")
        return model_embedding
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise
81
+
82
+ # Load FAISS index
83
def load_faiss_index():
    """Read the FAISS index stored at ``FAISS_INDEX_PATH`` and return it."""
    print("Loading FAISS index...")
    faiss_index = faiss.read_index(FAISS_INDEX_PATH)
    print("FAISS index loaded successfully.")
    return faiss_index
88
+
89
+ # Load dataset
90
def load_dataset():
    """Read the property CSV at ``DATASET_PATH`` into a DataFrame and return it."""
    print("Loading dataset...")
    properties = pd.read_csv(DATASET_PATH)
    print("Dataset loaded successfully.")
    return properties
95
+
96
+ # Custom Retriever Class
97
class CustomRagRetriever:
    """Nearest-neighbour property lookup over a FAISS index.

    Queries are embedded with ``model``, optionally projected with a PCA
    model found next to ``MODEL_PATH``, and matched against ``faiss_index``.
    Hits are resolved to rows of the module-level ``df`` DataFrame.
    """

    def __init__(self, faiss_index, model):
        """Store the index and embedding model and load ``pca_model.pkl`` if present.

        Args:
            faiss_index: Index object exposing ``search(vectors, k)``.
            model: Embedding model exposing ``encode``.
        """
        self.index = faiss_index
        self.model = model
        self.pca = None
        # Load PCA if it exists
        pca_path = os.path.join(os.path.dirname(MODEL_PATH), "pca_model.pkl")
        if os.path.exists(pca_path):
            # NOTE(review): pickle.load can execute arbitrary code; this file
            # must only ever come from a trusted source.
            with open(pca_path, 'rb') as f:
                self.pca = pickle.load(f)

    def retrieve(self, query, top_k=10):
        """Return up to ``top_k`` properties closest to ``query``.

        Args:
            query: Free-text search string.
            top_k: Maximum number of neighbours to request from the index.

        Returns:
            list[dict]: One dict per hit with keys ``property`` (the DataFrame
            row), ``image_url`` (the row's ``property_image`` value) and
            ``distance`` (float reported by the index).

        Raises:
            Exception: Any failure is printed and re-raised.
        """
        print(f"Retrieving properties for query: {query}")
        try:
            with torch.no_grad():
                query_embedding = self.model.encode(
                    [query],
                    convert_to_numpy=True,
                    device=device,
                    normalize_embeddings=True
                )
            query_embedding = query_embedding.astype(np.float32)

            if self.pca is not None:
                query_embedding = self.pca.transform(query_embedding)

            # FAISS requires a C-contiguous float32 matrix; re-assert this in
            # case the PCA projection returned another dtype or layout.
            query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)

            distances, indices = self.index.search(query_embedding, top_k)

            retrieved_properties = []
            for idx, dist in zip(indices[0], distances[0]):
                # FAISS pads the result with label -1 when fewer than top_k
                # neighbours exist; iloc[-1] would silently return the last row.
                if idx < 0:
                    continue
                property_data = df.iloc[idx]
                retrieved_properties.append({
                    "property": property_data,
                    "image_url": property_data["property_image"],
                    "distance": float(dist)
                })
            print(f"Retrieved {len(retrieved_properties)} properties")
            return retrieved_properties
        except Exception as e:
            print(f"Error in retrieve: {str(e)}")
            raise
140
+
141
+ # Initialize components
142
# Build the retrieval stack once at import time: dataset, embedder, index,
# then the retriever that ties the index and embedder together.
df = load_dataset()
model_embedding = load_sentence_transformer()
index = load_faiss_index()
retriever = CustomRagRetriever(faiss_index=index, model=model_embedding)
146
+
147
+ # Load tokenizer and LLM model
148
def load_tokenizer_and_model():
    """Load the tokenizer and causal language model stored under ``MODEL_DIR``.

    Returns:
        tuple: ``(tokenizer, model)`` with the model moved to ``device``.
    """
    print("Loading tokenizer...")
    llm_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    print("Tokenizer loaded successfully.")

    print("Loading LLM model...")
    llm = AutoModelForCausalLM.from_pretrained(MODEL_DIR)
    llm = llm.to(device)
    print("LLM model loaded successfully.")
    return llm_tokenizer, llm


# Loaded once at import time and shared by every request.
tokenizer, model_llm = load_tokenizer_and_model()
159
+
160
+ # Configure Cloudinary
161
def configure_cloudinary():
    """Pass the module-level ``CLOUDINARY_*`` settings to ``cloudinary.config``."""
    print("Configuring Cloudinary...")
    credentials = {
        "cloud_name": CLOUDINARY_CLOUD_NAME,
        "api_key": CLOUDINARY_API_KEY,
        "api_secret": CLOUDINARY_API_SECRET,
    }
    cloudinary.config(**credentials)
    print("Cloudinary configured successfully.")


configure_cloudinary()
171
+
172
+ # Search real estate properties
173
def _as_int(value, default=0):
    """Coerce *value* to int via float; missing, NaN, infinite or non-numeric values give *default*."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return default


def _as_float(value, default=0.0):
    """Coerce *value* to float; missing, NaN or non-numeric values give *default*."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    # NaN is the only float that differs from itself; it is not valid JSON.
    return default if number != number else number


def _as_text(value, default='N/A'):
    """Return *value* unchanged unless it is None or a NaN placeholder, then *default*."""
    if value is None or (isinstance(value, float) and value != value):
        return default
    return value


def _as_flag(value):
    """Interpret *value* as a boolean, treating None/NaN (missing) as False."""
    if value is None or (isinstance(value, float) and value != value):
        return False
    return bool(value)


# (output key, source column, converter) in the order the keys are emitted.
_PROPERTY_FIELDS = (
    ("Property Name", "PropertyName", _as_text),
    ("Address", "Address", _as_text),
    ("ZipCode", "ZipCode", _as_int),
    ("LeasableSquareFeet", "LeasableSquareFeet", _as_int),
    ("YearBuilt", "YearBuilt", _as_int),
    ("NumberOfRooms", "NumberOfRooms", _as_int),
    ("ParkingSpaces", "ParkingSpaces", _as_int),
    ("PropertyManager", "PropertyManager", _as_text),
    ("MarketValue", "MarketValue", _as_float),
    ("TaxAssessmentNumber", "TaxAssessmentNumber", _as_text),
    ("Latitude", "Latitude", _as_float),
    ("Longitude", "Longitude", _as_float),
    ("CreateDate", "CreateDate", _as_text),
    ("LastModifiedDate", "LastModifiedDate", _as_text),
    ("City", "City", _as_text),
    ("State", "State", _as_text),
    ("Country", "Country", _as_text),
    ("PropertyType", "PropertyType", _as_text),
    ("PropertyStatus", "PropertyStatus", _as_text),
    ("Description", "Description", _as_text),
    ("ViewNumber", "ViewNumber", _as_int),
    ("Contact", "Contact", _as_int),
    ("TotalSquareFeet", "TotalSquareFeet", _as_int),
    ("IsDeleted", "IsDeleted", _as_flag),
    ("Beds", "Beds", _as_int),
    ("Baths", "Baths", _as_int),
    ("AgentName", "AgentName", _as_text),
    ("AgentPhoneNumber", "AgentPhoneNumber", _as_text),
    ("AgentEmail", "AgentEmail", _as_text),
    ("KeyFeatures", "KeyFeatures", _as_text),
    ("NearbyAmenities", "NearbyAmenities", _as_text),
)


def search_real_estate(query, retriever, top_k=10, raw_results=False):
    """Search properties for *query* and optionally format them for JSON output.

    Args:
        query: Free-text search string.
        retriever: Object exposing ``retrieve(query, top_k)`` that returns dicts
            with ``property`` (mapping with ``.get``), ``image_url`` and
            ``distance`` keys.
        top_k: Maximum number of results to request.
        raw_results: When True, return the retriever output unchanged.

    Returns:
        list: The raw retriever results, or one flat dict per property.
        Missing or unparsable numeric cells become 0 and missing text cells
        become ``'N/A'`` instead of raising or leaking NaN into the output.
    """
    print(f"Searching real estate properties for query: {query}")
    search_results = retriever.retrieve(query, top_k)

    if raw_results:
        return search_results

    formatted_results = []
    for result in search_results:
        property_info = result['property']
        formatted_result = {
            label: convert(property_info.get(column))
            for label, column, convert in _PROPERTY_FIELDS
        }
        formatted_result["Property Image"] = result['image_url']
        formatted_result["Distance"] = result['distance']
        formatted_results.append(formatted_result)

    print(f"Found {len(formatted_results)} matching properties")
    return formatted_results
222
+
223
+ # Generate response with optimized parameters
224
def generate_response(query, max_new_tokens=100, temperature=0.7, top_k=30, top_p=0.8, repetition_penalty=1.05):
    """Generate a sampled completion for *query* with the module-level LLM.

    The query is wrapped as ``"User: <query>\\nAssistant:"``, tokenized with the
    module-level ``tokenizer`` and passed to ``model_llm.generate``.

    Args:
        query: Text placed in the user turn of the prompt.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.
        top_k: Top-k sampling cutoff.
        top_p: Nucleus sampling cutoff.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        tuple: ``(response, duration_seconds)``. If generation raises, the
        error is logged and ``("An error occurred while generating the
        response.", None)`` is returned.
    """
    print(f"\nGenerating response for query: {query}\n")

    # Print parameter settings
    print("Generation Parameters:")
    print(f"- Max New Tokens: {max_new_tokens}")
    print(f"- Temperature: {temperature}")
    print(f"- Top-K Sampling: {top_k}")
    print(f"- Top-P Sampling: {top_p}")
    print(f"- Repetition Penalty: {repetition_penalty}")
    print(f"- Sampling Enabled: True (do_sample=True)\n")

    input_text = f"User: {query}\nAssistant:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    prompt_length = inputs.input_ids.shape[1]

    # Tokenizers for causal LMs often have no pad token; fall back to EOS so
    # generate() is not handed pad_token_id=None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    start_time = time.time()  # Record start time

    try:
        outputs = model_llm.generate(
            inputs.input_ids,
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=pad_token_id
        )

        # A causal LM returns prompt + continuation. Slice the prompt off by
        # token count rather than by string replacement, which breaks when
        # decoding does not reproduce the prompt text exactly.
        generated_ids = outputs[0][prompt_length:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        duration = time.time() - start_time

        print(f"\nGenerated Response:\n{response}\n")
        print(f"Time taken to generate response: {duration:.2f} seconds\n")
        return response, duration

    except Exception as e:
        logging.error(f"Error generating response: {e}")
        return "An error occurred while generating the response.", None
267
+
268
+ # Combined model response with optimized parameters
269
def combined_model_response(query, retriever, top_k=5, max_new_tokens=512, temperature=0.5, top_k_sampling=30, repetition_penalty=1.0):
    """Retrieve properties for *query* and have the LLM answer from their details.

    Args:
        query: The user's question.
        retriever: Retriever passed through to ``search_real_estate``.
        top_k: Number of properties to retrieve.
        max_new_tokens: Generation length limit.
        temperature: Sampling temperature forwarded to ``generate_response``.
        top_k_sampling: Top-k sampling cutoff forwarded to ``generate_response``.
        repetition_penalty: Repetition penalty forwarded to ``generate_response``.

    Returns:
        tuple: ``(response, duration)``. When nothing is retrieved the result is
        ``("No relevant properties found.", 0.0)`` so callers can always unpack
        two values. ``duration`` is None when generation failed.
    """
    print(f"Generating combined model response for query: {query}")
    retrieved_results = search_real_estate(query, retriever, top_k, raw_results=True)
    if not retrieved_results:
        # Every caller unpacks (response, duration); a bare string would raise.
        return "No relevant properties found.", 0.0

    combined_property_details = []
    for i, result in enumerate(retrieved_results, 1):
        property_info = result['property']
        property_details = (
            f"Property {i}:\n"
            f"Property Name: {property_info['PropertyName']}\n"
            f"Address: {property_info['Address']}, {property_info['City']}, {property_info['State']}, {property_info['ZipCode']}, {property_info['Country']}\n"
            f"Leasable Area: {property_info['LeasableSquareFeet']} sqft\n"
            f"Year Built: {property_info['YearBuilt']}\n"
            f"Beds: {property_info['Beds']} Baths: {property_info['Baths']}\n"
            f"Parking Spaces: {property_info['ParkingSpaces']}\n"
            f"Market Value: {property_info['MarketValue']}\n"
            f"Property Type: {property_info['PropertyType']}\n"
            f"Property Status: {property_info['PropertyStatus']}\n"
            f"Description: {property_info['Description']}\n"
            f"Contact: {property_info['Contact']}\n"
            f"Total Square Feet: {property_info['TotalSquareFeet']} sqft\n"
            f"Agent Name: {property_info['AgentName']}\n"
            f"Agent Phone Number: {property_info['AgentPhoneNumber']}\n"
            f"Agent Email: {property_info['AgentEmail']}\n"
            f"Key Features: {property_info['KeyFeatures']}\n"
            f"Nearby Amenities: {property_info['NearbyAmenities']}\n"
            f"Created Date: {property_info['CreateDate']}\n"
            f"Last Modified Date: {property_info['LastModifiedDate']}\n"
        )
        combined_property_details.append(property_details)

    prompt = f"User Query: {query}\nProperty Details:\n" + "\n".join(combined_property_details) + "\nGenerate a concise response based on the user's query and retrieved property details."
    print(f"User Query: {query}")
    # Forward the sampling settings; previously they were accepted but ignored.
    response, duration = generate_response(
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k_sampling,
        repetition_penalty=repetition_penalty,
    )
    print(f"Combined model response: {response}")
    # generate_response reports failure with duration=None, which cannot be
    # formatted with ':.2f'.
    if duration is not None:
        print(f"Time taken to generate combined model response: {duration:.2f} seconds\n")
    return response, duration
310
+
311
+ # VAD Audio Class
312
class VADAudio:
    """Voice-activity-detection helper built on ``webrtcvad``.

    Operates on raw 16-bit mono PCM bytes: audio is cut into fixed-duration
    frames and consecutive voiced frames are grouped into speech segments.
    """

    # webrtcvad only accepts 16-bit PCM, i.e. two bytes per sample.
    BYTES_PER_SAMPLE = 2

    def __init__(self, aggressiveness=3):
        """Create the detector with the given webrtcvad aggressiveness."""
        self.vad = webrtcvad.Vad(aggressiveness)
        self.sample_rate = 16000
        self.frame_duration_ms = 30

    def frame_generator(self, audio, frame_duration_ms, sample_rate):
        """Yield consecutive full frames of *audio*.

        Args:
            audio: Raw 16-bit mono PCM bytes.
            frame_duration_ms: Frame length in milliseconds.
            sample_rate: Samples per second.

        Yields:
            bytes: Frames of exactly ``frame_duration_ms``; a trailing partial
            frame is discarded.
        """
        # Frame length in BYTES: samples per frame times bytes per sample.
        # Using the sample count alone halves the frame duration, which
        # webrtcvad rejects.
        n = int(sample_rate * (frame_duration_ms / 1000.0)) * self.BYTES_PER_SAMPLE
        if n <= 0:
            return
        offset = 0
        # '<=' so the final frame is kept when the audio ends on a boundary.
        while offset + n <= len(audio):
            yield audio[offset:offset + n]
            offset += n

    def vad_collector(self, audio, sample_rate, frame_duration_ms, padding_duration_ms=300, aggressiveness=3):
        """Yield the voiced segments found in *audio*.

        A sliding window of ``padding_duration_ms`` worth of frames decides
        when speech starts (more than 90% of the window voiced) and ends
        (more than 90% unvoiced).

        Args:
            audio: Raw 16-bit mono PCM bytes.
            sample_rate: Samples per second.
            frame_duration_ms: Frame length in milliseconds.
            padding_duration_ms: Length of the sliding decision window.
            aggressiveness: webrtcvad aggressiveness for this pass.

        Yields:
            bytes: One concatenated PCM chunk per detected speech segment,
            including speech still in progress when the audio ends.
        """
        vad = webrtcvad.Vad(aggressiveness)
        num_padding_frames = int(padding_duration_ms / frame_duration_ms)
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False
        voiced_frames = []

        for frame in self.frame_generator(audio, frame_duration_ms, sample_rate):
            is_speech = vad.is_speech(frame, sample_rate)
            ring_buffer.append((frame, is_speech))
            if not triggered:
                num_voiced = sum(1 for _, speech in ring_buffer if speech)
                if num_voiced > 0.9 * ring_buffer.maxlen:
                    triggered = True
                    # Keep the buffered lead-in so the segment start is not clipped.
                    voiced_frames.extend(f for f, _ in ring_buffer)
                    ring_buffer.clear()
            else:
                voiced_frames.append(frame)
                num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
                if num_unvoiced > 0.9 * ring_buffer.maxlen:
                    triggered = False
                    # The ring buffer holds (frame, flag) tuples, so join the
                    # collected frames instead of the buffer entries.
                    yield b''.join(voiced_frames)
                    ring_buffer.clear()
                    voiced_frames = []

        if voiced_frames:
            yield b''.join(voiced_frames)
349
+
350
+ # Transcribe with VAD
351
def transcribe_with_vad(audio_file):
    """Transcribe the first recognisable voiced chunk of *audio_file*.

    The audio is converted to 16 kHz, mono, 16-bit PCM, split with
    ``VADAudio.vad_collector`` and each non-empty chunk is sent to Google
    Speech Recognition until one is transcribed.

    Args:
        audio_file: Path or file-like object accepted by ``AudioSegment.from_file``.

    Returns:
        str: The first successful transcription, or ``""`` if none succeeded.
    """
    vad_audio = VADAudio()
    audio = AudioSegment.from_file(audio_file)
    # webrtcvad only accepts 16-bit mono PCM, so pin the sample width as well
    # as the rate and channel count.
    audio = (
        audio.set_frame_rate(vad_audio.sample_rate)
        .set_channels(1)
        .set_sample_width(2)
    )
    raw_audio = audio.raw_data

    recognizer = sr.Recognizer()
    chunks = vad_audio.vad_collector(raw_audio, vad_audio.sample_rate, vad_audio.frame_duration_ms)
    for chunk in chunks:
        if len(chunk) == 0:
            continue
        audio_data = sr.AudioData(chunk, vad_audio.sample_rate, audio.sample_width)
        try:
            text = recognizer.recognize_google(audio_data)
            print(f"Transcription: {text}")
            return text
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand the audio")
        except sr.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service; {e}")
    return ""
371
+
372
@app.route('/')
def index():
    """Render and return the ``index.html`` template for the root URL."""
    landing_page = render_template('index.html')
    return landing_page
375
+
376
def _format_search_hit(result):
    """Flatten one retriever hit into the JSON-serialisable dict returned by /search.

    Missing, NaN or unparsable numeric cells become 0 and missing text cells
    become ``'N/A'`` so one incomplete row cannot fail the whole request.
    """
    info = result['property']

    def is_missing(value):
        # A NaN placeholder is the only float that differs from itself.
        return value is None or (isinstance(value, float) and value != value)

    def text(column):
        value = info.get(column, 'N/A')
        return 'N/A' if is_missing(value) else value

    def whole(column):
        try:
            return int(float(info.get(column, 0)))
        except (TypeError, ValueError, OverflowError):
            return 0

    def real(column):
        try:
            number = float(info.get(column, 0))
        except (TypeError, ValueError):
            return 0.0
        return 0.0 if number != number else number

    def flag(column):
        value = info.get(column, False)
        return False if is_missing(value) else bool(value)

    return {
        "Property Name": text('PropertyName'),
        "Address": text('Address'),
        "ZipCode": whole('ZipCode'),
        "LeasableSquareFeet": whole('LeasableSquareFeet'),
        "YearBuilt": whole('YearBuilt'),
        "NumberOfRooms": whole('NumberOfRooms'),
        "ParkingSpaces": whole('ParkingSpaces'),
        "PropertyManager": text('PropertyManager'),
        "MarketValue": real('MarketValue'),
        "TaxAssessmentNumber": text('TaxAssessmentNumber'),
        "City": text('City'),
        "State": text('State'),
        "Country": text('Country'),
        "PropertyType": text('PropertyType'),
        "PropertyStatus": text('PropertyStatus'),
        "Description": text('Description'),
        "ViewNumber": whole('ViewNumber'),
        "Contact": whole('Contact'),
        "TotalSquareFeet": whole('TotalSquareFeet'),
        "IsDeleted": flag('IsDeleted'),
        "Beds": whole('Beds'),
        "Baths": whole('Baths'),
        "AgentName": text('AgentName'),
        "AgentPhoneNumber": text('AgentPhoneNumber'),
        "AgentEmail": text('AgentEmail'),
        "KeyFeatures": text('KeyFeatures'),
        "NearbyAmenities": text('NearbyAmenities'),
        "Property Image": result['image_url'],
        "Distance": float(result['distance']),
    }


@app.route('/search', methods=['POST'])
def search():
    """Handle POST /search: return matching properties as a JSON list.

    Expects a JSON object with ``query`` and optionally ``session_id`` and
    ``continue``. With ``continue`` set and a result list already cached for
    the session, the cached list is returned; otherwise a fresh search runs
    and its results are cached under ``session_id``.

    Returns:
        A JSON list of property dicts; 400 when the body or query is missing;
        500 with an error message on any other failure.
    """
    try:
        # silent=True yields None for a missing or malformed body so it can be
        # reported as a client error instead of surfacing as a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Query parameter is missing"}), 400

        query = data.get('query')
        session_id = data.get('session_id')
        continue_conversation = data.get('continue', False)

        if not query:
            return jsonify({"error": "Query parameter is missing"}), 400

        # Other routes store strings or dicts under the same session key;
        # only a list is a reusable set of search results.
        cached = conversation_context.get(session_id) if continue_conversation else None
        if isinstance(cached, list):
            formatted_results = cached
        else:
            formatted_results = [
                _format_search_hit(result) for result in retriever.retrieve(query)
            ]
            conversation_context[session_id] = formatted_results

        print(f"Returning {len(formatted_results)} search results")
        return jsonify(formatted_results)

    except Exception as e:
        logging.error(f"Error in search endpoint: {str(e)}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500
436
+
437
# Grammar-correction model, created on first use and then reused.
_GRAMMAR_MODEL = None


def _get_grammar_model():
    """Return the shared T5 grammar-correction model, loading it on first call."""
    global _GRAMMAR_MODEL
    if _GRAMMAR_MODEL is None:
        _GRAMMAR_MODEL = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
    return _GRAMMAR_MODEL


@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Handle POST /transcribe: speech-to-text with grammar correction.

    Expects a multipart upload named ``audio`` with a wav, mp3, ogg or webm
    extension. The audio is converted to 16 kHz mono WAV, transcribed with
    Google Speech Recognition and passed through the grammar model.

    Returns:
        JSON with ``transcription`` (corrected) and ``original`` text; 400 for
        a missing/unsupported file or unintelligible audio; 500 for
        recognition-service or processing errors.
    """
    # Local import: tempfile is not among this module's top-level imports.
    import tempfile

    if 'audio' not in request.files:
        return jsonify({"error": "No audio file provided"}), 400

    audio_file = request.files['audio']

    # Ensure the file has an allowed extension
    allowed_extensions = {'wav', 'mp3', 'ogg', 'webm'}
    filename = audio_file.filename or ''
    extension = filename.rsplit('.', 1)[1].lower() if '.' in filename else ''
    if extension not in allowed_extensions:
        return jsonify({"error": "Invalid audio file format"}), 400

    try:
        # A private directory per request: fixed file names let concurrent
        # uploads overwrite each other, and a .wav upload used to share its
        # path with the converted file, so cleanup always failed.
        with tempfile.TemporaryDirectory() as temp_dir:
            upload_path = os.path.join(temp_dir, 'upload.' + extension)
            wav_path = os.path.join(temp_dir, 'converted.wav')

            audio_file.save(upload_path)

            # Convert audio to the format used for recognition
            audio = AudioSegment.from_file(upload_path)
            audio = audio.set_channels(1)  # Convert to mono
            audio = audio.set_frame_rate(16000)  # Set sample rate to 16kHz

            # export() returns the open output file; close it explicitly.
            audio.export(wav_path, format="wav").close()

            recognizer = sr.Recognizer()
            with sr.AudioFile(wav_path) as source:
                audio_data = recognizer.record(source)

        text = recognizer.recognize_google(audio_data)

        # Grammar correction
        settings = TTSettings(do_sample=True, top_k=50, temperature=0.7)
        corrected_text = _get_grammar_model().generate_text(f"grammar: {text}", args=settings)

        print(f"Original Transcription: {text}")
        print(f"Corrected Transcription: {corrected_text.text}")

        return jsonify({
            "transcription": corrected_text.text,
            "original": text
        })

    except sr.UnknownValueError:
        return jsonify({"error": "Could not understand audio"}), 400
    except sr.RequestError as e:
        return jsonify({"error": f"Google Speech Recognition error: {str(e)}"}), 500
    except Exception as e:
        logging.error(f"Error processing audio: {str(e)}")
        return jsonify({"error": f"Audio processing error: {str(e)}"}), 500
503
+
504
@app.route('/generate', methods=['POST'])
def generate():
    """Handle POST /generate: answer a query with the LLM alone.

    Expects a JSON object with ``query`` and optionally ``session_id`` and
    ``continue``. With ``continue`` set and context stored for the session,
    that context is prepended to the query. The new response is then stored
    under ``session_id``.

    Returns:
        JSON with ``response`` and ``duration`` (null if generation failed),
        or 400 when the body or query is missing.
    """
    # silent=True yields None for a missing or malformed body instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Query parameter is missing"}), 400

    query = data.get('query')
    session_id = data.get('session_id')
    continue_conversation = data.get('continue', False)
    if not query:
        return jsonify({"error": "Query parameter is missing"}), 400

    if session_id in conversation_context and continue_conversation:
        previous_results = conversation_context[session_id]
        combined_query = f"Based on previous results:{previous_results}New Query: {query}"
        response, duration = generate_response(combined_query)
    else:
        response, duration = generate_response(query)
    conversation_context[session_id] = response

    print(f"Generated response: {response}")
    # generate_response reports failure with duration=None, which cannot be
    # formatted with ':.2f'.
    if duration is not None:
        print(f"Time taken to generate response: {duration:.2f} seconds\n")
    return jsonify({"response": response, "duration": duration})
522
+
523
@app.route('/recommend', methods=['POST'])
def recommend():
    """Handle POST /recommend: greeting flow, nearby properties, or an LLM recommendation.

    Expects a JSON object with ``query`` and optionally ``session_id`` and
    ``continue``.

    * ``"hi"`` returns a prompt asking whether nearby properties are wanted.
    * ``"yes"`` returns the five properties closest to the location stored
      for the session by /set-location.
    * Any other query is answered by ``combined_model_response`` and the
      response is stored under ``session_id``.

    Returns:
        A JSON response; 400 when the body/query is missing or no location is
        stored for a ``"yes"`` request.
    """
    # silent=True yields None for a missing or malformed body instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Query parameter is missing"}), 400

    query = data.get('query')
    session_id = data.get('session_id')
    continue_conversation = data.get('continue', False)

    if not query or not isinstance(query, str):
        return jsonify({"error": "Query parameter is missing"}), 400

    normalized_query = query.lower()

    if normalized_query == 'hi':
        return jsonify({"response": "Do you want to know the properties located near you? (yes/no):"})

    if normalized_query == 'yes':
        # The session slot may hold a string or list written by other routes;
        # only the dict written by /set-location carries a location.
        context = conversation_context.get(session_id)
        if not isinstance(context, dict) or 'location' not in context:
            return jsonify({"error": "Location not available. Please try again."}), 400

        my_location = tuple(context['location'])

        # Parse coordinates numerically: unparsable cells become NaN and are
        # dropped together with out-of-range values, so negative coordinates
        # stored as text are kept and geodesic() never sees NaN.
        coordinates = df[['Latitude', 'Longitude']].apply(pd.to_numeric, errors='coerce')
        is_valid = (
            coordinates['Latitude'].between(-90, 90)
            & coordinates['Longitude'].between(-180, 180)
        )
        valid_properties = df[is_valid].copy()
        valid_properties['Latitude'] = coordinates.loc[is_valid, 'Latitude']
        valid_properties['Longitude'] = coordinates.loc[is_valid, 'Longitude']

        if valid_properties.empty:
            return jsonify({"response": "No valid properties found near your location."})

        # Calculate distances for valid properties
        valid_properties['Distance'] = valid_properties.apply(
            lambda row: geodesic(my_location, (row['Latitude'], row['Longitude'])).miles,
            axis=1
        )

        # Get 5 nearest properties
        nearest_properties = valid_properties.nsmallest(5, 'Distance')

        nearest_properties_list = nearest_properties[[
            'PropertyName', 'Address', 'City', 'Distance',
            'PropertyType', 'AgentPhoneNumber'
        ]].to_dict(orient='records')

        if not nearest_properties_list:
            return jsonify({"response": "No valid properties found near your location."})

        return jsonify({
            "response": "Here are the 5 nearest properties to your location:",
            "properties": nearest_properties_list
        })

    if session_id in conversation_context and continue_conversation:
        previous_results = conversation_context[session_id]
        combined_query = f"Based on previous results:{previous_results}New Query: {query}"
        result = combined_model_response(combined_query, retriever)
    else:
        result = combined_model_response(query, retriever)

    # The helper may return a bare message instead of a (response, duration)
    # pair when nothing was retrieved; accept both shapes.
    if isinstance(result, tuple):
        response, duration = result
    else:
        response, duration = result, None

    # NOTE(review): this replaces whatever was stored for the session,
    # including a location dict written by /set-location.
    conversation_context[session_id] = response

    print(f"Recommended response: {response}")
    if duration is not None:
        print(f"Time taken to generate recommended response: {duration:.2f} seconds\n")
    return jsonify({"response": response, "duration": duration})
587
+
588
@app.route('/set-location', methods=['POST'])
def set_location():
    """Handle POST /set-location: store the caller's coordinates for the session.

    Expects a JSON object with ``latitude``, ``longitude`` and optionally
    ``session_id``. The coordinates are reverse-geocoded with Nominatim and
    stored, with the resolved city, state and country, under ``session_id``.

    Returns:
        JSON with a confirmation message and the resolved place names; 400
        when the coordinates are missing, non-numeric, out of range or cannot
        be resolved; 500 when the geocoding lookup fails.
    """
    # silent=True yields None for a missing or malformed body instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Location parameters are missing"}), 400

    latitude = data.get('latitude')
    longitude = data.get('longitude')
    session_id = data.get('session_id')

    if latitude is None or longitude is None:
        return jsonify({"error": "Location parameters are missing"}), 400

    # Store real numbers so later distance calculations never receive text.
    try:
        latitude = float(latitude)
        longitude = float(longitude)
    except (TypeError, ValueError):
        return jsonify({"error": "Location parameters must be numeric"}), 400

    # The chained comparisons also reject NaN, which compares false to everything.
    if not (-90 <= latitude <= 90 and -180 <= longitude <= 180):
        return jsonify({"error": "Location parameters are out of range"}), 400

    try:
        # Initialize the geolocator
        geolocator = Nominatim(user_agent="hive_prop")

        # Get location details from coordinates
        location = geolocator.reverse((latitude, longitude), language='en')

        if location and location.raw.get('address'):
            address = location.raw['address']
            city = address.get('city') or address.get('town') or address.get('suburb') or address.get('county')
            state = address.get('state')
            country = address.get('country')

            # Store location data in conversation context
            conversation_context[session_id] = {
                'location': (latitude, longitude),
                'city': city,
                'state': state,
                'country': country
            }

            return jsonify({
                "message": "Location set successfully.",
                "city": city,
                "state": state,
                "country": country
            })
        else:
            return jsonify({"error": "Could not determine city from coordinates"}), 400

    except Exception as e:
        logging.error(f"Error getting location details: {str(e)}")
        return jsonify({"error": f"Error processing location: {str(e)}"}), 500
631
+
632
if __name__ == "__main__":
    # Bind to every interface on port 7860, the port the Dockerfile exposes
    # for Hugging Face Spaces.
    app.run(port=7860, host="0.0.0.0")
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
flask==2.0.1
flask-cors==3.0.10
torch==2.0.1
transformers==4.30.2
sentence-transformers==2.2.2
faiss-cpu==1.7.4
pandas==1.5.3
numpy==1.24.3
geopy==2.3.0
geocoder==1.38.1
cloudinary==1.33.0
pydub==0.25.1
SpeechRecognition==3.10.0
webrtcvad==2.0.10
happytransformer==2.4.1
Werkzeug==2.0.3
# Imported at the top of app.py but previously not listed here; without them
# the import can fail at startup. Pin once a known-good version is chosen.
pyngrok
requests