# models/location_analysis.py
"""Validate and verify location data for Indian property listings.

The ``validate_*`` functions are pure format checks. The ``verify_*``
functions query the module-level Nominatim ``geocoder`` over the network and
sleep between requests, so they are slow; on any geocoder failure they log
and return ``False`` instead of raising.
"""

import re
import time
from typing import Any, Dict, List

from geopy.distance import geodesic
from geopy.geocoders import Nominatim

# NOTE(review): ``load_model`` is not used in this module; the import is kept
# because other modules may rely on it (or on its import side effects).
from .model_loader import load_model
from .logging_config import logger

geocoder = Nominatim(user_agent="indian_property_verifier", timeout=10)

# Per-request timeout (seconds) and pause between consecutive geocoder calls.
_GEOCODE_TIMEOUT = 10
_RATE_LIMIT_SECONDS = 1

# City tiers are matched by substring so that e.g. "Navi Mumbai" or
# "Greater Hyderabad" still resolve to the metro tier.
_METRO_CITIES = (
    "mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune",
)
_TIER2_CITIES = (
    "ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi",
)
# Maximum distance (km) from the geocoded city centre that still counts as
# "inside" the city, per tier.
_MAX_DISTANCE_KM = {"metro": 50, "tier2": 30, "tier3": 20}

# Words that mark an address component as an administrative unit.
_ADMIN_KEYWORDS = ('ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater')
_MIN_KEY_COMPONENT_MATCHES = 2

# Words whose presence makes an address look plausible regardless of length.
_ADDRESS_TERMS = (
    'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater',
    'street', 'road', 'avenue', 'lane', 'colony', 'society', 'area', 'near',
    'apartment', 'flat', 'house', 'plot', 'block', 'sector', 'locality',
    'main', 'cross', 'circle', 'square', 'market',
)

# Bounding box for India with a generous buffer on every side.
_INDIA_BOUNDS = {'lat_min': 5.0, 'lat_max': 40.0, 'lng_min': 65.0, 'lng_max': 100.0}

_VALID_STATES = frozenset({
    'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh',
    'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka',
    'kerala', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya',
    'mizoram', 'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim',
    'tamil nadu', 'telangana', 'tripura', 'uttar pradesh', 'uttarakhand',
    'west bengal', 'andaman and nicobar islands', 'chandigarh',
    'dadra and nagar haveli and daman and diu', 'delhi', 'jammu and kashmir',
    'ladakh', 'lakshadweep', 'puducherry',
})

# Score contribution (percentage points, summing to 100) of each check in
# ``analyze_location``. Integers avoid float artefacts such as 15.000000000000002.
_CHECK_WEIGHTS = {
    'address_format_valid': 10,
    'address_in_city': 15,
    'city_in_state': 15,
    'state_in_country': 15,
    'postal_code_valid': 15,
    'postal_code_in_city': 10,
    'coordinates_valid': 10,
    'coordinates_in_city': 10,
}
_CRITICAL_CHECKS = ('city_in_state', 'state_in_country')
_SECONDARY_CHECKS = (
    'address_format_valid', 'address_in_city', 'postal_code_valid',
    'postal_code_in_city', 'coordinates_valid', 'coordinates_in_city',
)

_LANDMARK_TYPES = {
    'transport': ['station', 'metro', 'bus', 'railway', 'airport', 'terminal', 'depot', 'stand', 'stop'],
    'education': ['school', 'college', 'university', 'institute', 'academy', 'campus', 'library'],
    'healthcare': ['hospital', 'clinic', 'medical', 'health', 'diagnostic', 'pharmacy', 'dispensary'],
    'shopping': ['mall', 'market', 'shop', 'store', 'bazaar', 'complex', 'plaza', 'retail', 'outlet'],
    'entertainment': ['park', 'garden', 'theater', 'cinema', 'stadium', 'auditorium', 'playground'],
    'business': ['office', 'business', 'corporate', 'commercial', 'industrial', 'tech park', 'hub'],
}

# Score contribution (percentage points) of each raw field in
# ``calculate_location_completeness``.
_FIELD_WEIGHTS = {
    'address': 25,
    'city': 20,
    'state': 15,
    'country': 5,
    'zip': 10,
    'latitude': 10,
    'longitude': 10,
    'nearby_landmarks': 5,
}


def _city_tier(city: str) -> str:
    """Return ``"metro"``, ``"tier2"`` or ``"tier3"`` for a city name."""
    city_lower = str(city).lower()
    if any(name in city_lower for name in _METRO_CITIES):
        return "metro"
    if any(name in city_lower for name in _TIER2_CITIES):
        return "tier2"
    return "tier3"


def _name_variations(name: str) -> List[str]:
    """Return lower-cased spellings of a place name (plain, no spaces, hyphenated)."""
    lowered = name.lower()
    return [lowered, lowered.replace(' ', ''), lowered.replace(' ', '-')]


def _mentions(text: str, variations: List[str]) -> bool:
    """Return True if any of the spellings occurs in ``text``."""
    return any(variation in text for variation in variations)


def _build_address_queries(address: str, city: str, components: List[str]) -> List[str]:
    """Build the geocoder queries to try for an address, most specific first.

    Queries are only generated for components that actually exist, and
    duplicates are removed while preserving order.
    """
    queries = [f"{address}, India"]
    if components:
        queries.append(f"{city}, {components[0]}, India")
    if len(components) > 1:
        queries.append(f"{city}, {components[1]}, India")
    for keyword in ('municipal corporation', 'mandal'):
        match = next((part for part in components if keyword in part.lower()), None)
        if match:
            queries.append(f"{city}, {match}, India")
    if components:
        queries.append(f"{components[0]}, {city}, India")
    zone = next((part for part in components if 'zone' in part.lower()), None)
    if zone:
        queries.append(f"{zone}, {city}, India")
    return list(dict.fromkeys(queries))


def _split_landmarks(raw: Any) -> List[str]:
    """Return the non-empty, lower-cased landmark names from a string or sequence."""
    if not raw:
        return []
    items = raw if isinstance(raw, (list, tuple, set)) else str(raw).split(',')
    cleaned = (str(item).strip().lower() for item in items)
    return [item for item in cleaned if item]


def validate_address_format(address: str) -> bool:
    """Validate the format of the address.

    An address is accepted when it is at least 5 characters long (ignoring
    surrounding whitespace), contains at least one letter in any script, and
    either mentions a common address term (road, colony, sector, ...) or is
    at least 8 characters long.
    """
    if not address:
        return False

    text = str(address).strip()
    if len(text) < 5:
        return False

    # Require a real letter: digits and spaces alone are not an address.
    if not any(char.isalpha() for char in text):
        return False

    lowered = text.lower()
    if any(term in lowered for term in _ADDRESS_TERMS):
        return True
    return len(text) >= 8


def validate_postal_code(postal_code: str) -> bool:
    """Validate Indian postal code format.

    Deliberately lenient: spaces are ignored and any 5- or 6-digit code is
    accepted, whatever its first digit.
    """
    if not postal_code:
        return False

    compact = str(postal_code).strip().replace(' ', '')
    return re.fullmatch(r'\d{5,6}', compact) is not None


def validate_coordinates(latitude: str, longitude: str) -> bool:
    """Validate coordinate format and range for India.

    Returns True when both values parse as floats and fall inside a
    buffered bounding box around India; unparsable values and NaN yield False.
    """
    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    return (
        _INDIA_BOUNDS['lat_min'] <= lat <= _INDIA_BOUNDS['lat_max']
        and _INDIA_BOUNDS['lng_min'] <= lng <= _INDIA_BOUNDS['lng_max']
    )


def verify_location_in_city(address: str, city: str) -> bool:
    """Verify if the address exists in the given city.

    Two strategies are tried in order:

    1. Direct match: geocode several query variants and accept when the
       result mentions the city and at least two administrative components of
       the address (ward, zone, mandal, ...). Skipped when the address has
       fewer than two such components, because it could never succeed.
    2. Distance fallback: geocode the city and the address and accept when
       they lie within the tier-based distance threshold.

    Returns False on empty input or on any geocoder error.
    """
    if not address or not city:
        return False

    try:
        address = address.strip()
        city = city.strip()
        if not address or not city:
            return False

        components = [part.strip() for part in address.split(',') if part.strip()]
        # The longer spellings ("<city> district", "greater <city>", ...) all
        # contain the bare city name, so these three spellings are sufficient.
        city_variations = _name_variations(city)
        key_components = [
            part.lower() for part in components
            if any(keyword in part.lower() for keyword in _ADMIN_KEYWORDS)
        ]

        if len(key_components) >= _MIN_KEY_COMPONENT_MATCHES:
            for query in _build_address_queries(address, city, components):
                try:
                    location = geocoder.geocode(query, timeout=_GEOCODE_TIMEOUT)
                    if location:
                        location_address = location.address.lower()
                        if _mentions(location_address, city_variations):
                            location_parts = [
                                part.strip() for part in location_address.split(',')
                            ]
                            matched = sum(
                                1 for key in key_components
                                if any(key in part for part in location_parts)
                            )
                            if matched >= _MIN_KEY_COMPONENT_MATCHES:
                                return True
                except Exception as e:
                    logger.debug(f"Error in address verification: {str(e)}")
                # Pause after every attempt, including failed ones, so that a
                # burst of errors cannot turn into a burst of requests.
                time.sleep(_RATE_LIMIT_SECONDS)

        # If direct verification fails, compare geocoded positions instead.
        try:
            city_location = geocoder.geocode(f"{city}, India", timeout=_GEOCODE_TIMEOUT)
            if city_location:
                time.sleep(_RATE_LIMIT_SECONDS)
                address_location = geocoder.geocode(
                    f"{address}, {city}, India", timeout=_GEOCODE_TIMEOUT
                )
                if address_location:
                    city_coords = (city_location.latitude, city_location.longitude)
                    address_coords = (address_location.latitude, address_location.longitude)
                    distance = geodesic(city_coords, address_coords).kilometers
                    return distance <= _MAX_DISTANCE_KM[_city_tier(city)]
        except Exception as e:
            logger.debug(f"Error in reverse geocoding: {str(e)}")

        return False
    except Exception as e:
        logger.error(f"Error in location verification: {str(e)}")
        return False


def verify_city_in_state(city: str, state: str) -> bool:
    """Verify if the city exists in the given state.

    Geocodes a few "city, state" query variants and returns True as soon as
    one result's address mentions both names. Returns False on empty input
    or when every attempt fails.
    """
    if not city or not state:
        return False

    try:
        city = str(city).strip()
        state = str(state).strip()
        if not city or not state:
            return False

        city_variations = _name_variations(city)
        state_variations = _name_variations(state)
        queries = [
            f"{city}, {state}, India",
            f"{state}, {city}, India",
            f"{city}, {state}",
        ]

        for query in queries:
            try:
                location = geocoder.geocode(query, timeout=_GEOCODE_TIMEOUT)
                if location:
                    location_address = location.address.lower()
                    if (_mentions(location_address, city_variations)
                            and _mentions(location_address, state_variations)):
                        return True
            except Exception as e:
                logger.debug(f"Error in city/state verification: {str(e)}")
            time.sleep(_RATE_LIMIT_SECONDS)

        return False
    except Exception as e:
        logger.error(f"Error in city/state verification: {str(e)}")
        return False


def verify_state_in_country(state: str, country: str = "India") -> bool:
    """Verify if the state exists in the given country.

    The state is compared case-insensitively, ignoring extra whitespace and
    treating ``&`` as ``and``, against the list of Indian states and union
    territories.

    NOTE(review): ``country`` is accepted for interface compatibility but is
    not consulted; only Indian states are known to this module.
    """
    if not state:
        return False

    normalized = ' '.join(str(state).lower().replace('&', ' and ').split())
    return normalized in _VALID_STATES


def verify_postal_code_in_city(postal_code: str, city: str) -> bool:
    """Verify if the postal code belongs to the given city.

    Geocodes a few "postal code, city" query variants and returns True as
    soon as one result's address mentions the city. Returns False on empty
    input or when every attempt fails.
    """
    if not postal_code or not city:
        return False

    try:
        postal_code = str(postal_code).strip()
        city = str(city).strip()
        if not postal_code or not city:
            return False

        city_variations = _name_variations(city)
        queries = [
            f"{postal_code}, {city}, India",
            f"{city}, {postal_code}, India",
            f"{postal_code}, {city}",
        ]

        for query in queries:
            try:
                location = geocoder.geocode(query, timeout=_GEOCODE_TIMEOUT)
                if location and _mentions(location.address.lower(), city_variations):
                    return True
            except Exception as e:
                logger.debug(f"Error in postal code verification: {str(e)}")
            time.sleep(_RATE_LIMIT_SECONDS)

        return False
    except Exception as e:
        logger.error(f"Error in postal code verification: {str(e)}")
        return False


def verify_coordinates_in_city(latitude: str, longitude: str, city: str) -> bool:
    """Verify if the coordinates are within the given city.

    The city is geocoded and the geodesic distance to the coordinates is
    compared with a tier-based threshold (50 km metro, 30 km tier 2, 20 km
    otherwise). Returns False for missing or unparsable input and on geocoder
    errors.
    """
    if not all([latitude, longitude, city]):
        return False

    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    try:
        city_location = geocoder.geocode(f"{city}, India", timeout=_GEOCODE_TIMEOUT)
        if not city_location:
            return False

        city_coords = (city_location.latitude, city_location.longitude)
        distance = geodesic(city_coords, (lat, lng)).kilometers
        return distance <= _MAX_DISTANCE_KM[_city_tier(city)]
    except Exception as e:
        logger.debug(f"Error in coordinate verification: {str(e)}")
        return False


def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze location data with detailed verification.

    Args:
        data: Mapping with the keys ``address``, ``city``, ``state``, ``zip``,
            ``latitude``, ``longitude`` and ``nearby_landmarks``. Missing keys
            are added to ``data`` with an empty-string value (side effect kept
            for callers that read the dict afterwards). A non-dict input is
            treated as empty.

    Returns:
        A dict with one boolean per check plus ``assessment``
        (``complete``/``partial``/``minimal``), ``completeness_score``
        (0-100), ``location_quality``, ``verification_status``, ``city_tier``,
        ``landmarks_analysis`` and ``formatted_address``. If anything raises,
        an ``error`` result with all checks False is returned instead.
    """
    try:
        if not isinstance(data, dict):
            logger.warning(f"Input to analyze_location is not a dict: {type(data)}")
            data = {}

        for key in ['address', 'city', 'state', 'zip', 'latitude', 'longitude', 'nearby_landmarks']:
            data.setdefault(key, '')

        address, city, state = data['address'], data['city'], data['state']
        postal_code, latitude, longitude = data['zip'], data['latitude'], data['longitude']

        verification_results = {
            'address_format_valid': validate_address_format(address),
            'address_in_city': verify_location_in_city(address, city),
            'city_in_state': verify_city_in_state(city, state),
            'state_in_country': verify_state_in_country(state),
            'postal_code_valid': validate_postal_code(postal_code),
            'postal_code_in_city': verify_postal_code_in_city(postal_code, city),
            'coordinates_valid': validate_coordinates(latitude, longitude),
            'coordinates_in_city': verify_coordinates_in_city(latitude, longitude, city),
        }

        completeness_score = sum(
            _CHECK_WEIGHTS[name] for name, passed in verification_results.items() if passed
        )
        # Grant a minimum score when at least a city or state was supplied.
        # This is applied before the assessment so the two always agree.
        if completeness_score == 0 and (city or state):
            completeness_score = 40

        if completeness_score >= 60:
            assessment = "complete"
        elif completeness_score >= 30:
            assessment = "partial"
        else:
            assessment = "minimal"

        # Verified = every critical check passes and at least one secondary one.
        critical_passed = all(verification_results[name] for name in _CRITICAL_CHECKS)
        secondary_passed = sum(1 for name in _SECONDARY_CHECKS if verification_results[name])
        location_quality = "verified" if critical_passed and secondary_passed >= 1 else "unverified"

        raw_landmarks = data.get('nearby_landmarks')
        landmarks = _split_landmarks(raw_landmarks)
        landmark_types: List[str] = []
        for landmark in landmarks:
            for type_name, keywords in _LANDMARK_TYPES.items():
                if type_name not in landmark_types and any(word in landmark for word in keywords):
                    landmark_types.append(type_name)
        landmarks_analysis = {
            'provided': bool(raw_landmarks),
            'count': len(landmarks),
            'types': landmark_types,
        }

        city_tier = _city_tier(city) if city else "unknown"

        return {
            **verification_results,
            'assessment': assessment,
            'completeness_score': completeness_score,
            'location_quality': location_quality,
            'city_tier': city_tier,
            'landmarks_analysis': landmarks_analysis,
            'verification_status': "verified" if location_quality == "verified" else "unverified",
            'formatted_address': f"{data.get('address', '')}, {data.get('city', '')}, {data.get('state', '')}, India - {data.get('zip', '')}"
        }
    except Exception as e:
        logger.error(f"Error analyzing location: {str(e)}")
        return {
            'assessment': 'error',
            'completeness_score': 30,  # Non-zero so an internal failure is not scored as "no data"
            'location_quality': 'error',
            'city_tier': 'unknown',
            'landmarks_analysis': {'provided': False, 'count': 0, 'types': []},
            'verification_status': 'error',
            'formatted_address': '',
            'address_format_valid': False,
            'address_in_city': False,
            'city_in_state': False,
            'state_in_country': False,
            'postal_code_valid': False,
            'postal_code_in_city': False,
            'coordinates_valid': False,
            'coordinates_in_city': False
        }


def calculate_location_completeness(data):
    """Return a 0-100 score for how many location fields are filled in.

    Each truthy field contributes a fixed number of points (address 25,
    city 20, state 15, zip/latitude/longitude 10 each, country and
    nearby_landmarks 5 each). Missing fields count as empty instead of
    raising ``KeyError``; a ``None`` or otherwise unsubscriptable ``data``
    scores 0.
    """
    score = 0
    for field, points in _FIELD_WEIGHTS.items():
        try:
            value = data[field]
        except (KeyError, TypeError):
            continue
        if value:
            score += points
    return score