Upload 22 files

Files changed:
- app.py (+53 −53)
- models/address_verification.py (+31 −21)
- models/cross_validation.py (+26 −26)
- models/fraud_classification.py (+45 −43)
- models/location_analysis.py (+60 −44)
- models/price_analysis.py (+21 −21)
- models/trust_score.py (+58 −55)
app.py
CHANGED

@@ -244,7 +244,7 @@ def calculate_final_verdict(results):
             'confidence': 0.0,
             'reasoning': 'Insufficient data for verification',
             'risk_level': 'medium',
-            'overall_score': 25
+            'overall_score': 50  # Increased from 25
         }

     # Extract key metrics with defensive programming

@@ -258,7 +258,7 @@ def calculate_final_verdict(results):
     specs_verification = results.get('specs_verification', {})
     quality_assessment = results.get('quality_assessment', {})

-    # CRITICAL: Check for fake data patterns in cross validation
+    # CRITICAL: Check for fake data patterns in cross validation - Much more lenient
     fake_data_detected = False
     fraudulent_issues = 0
     high_severity_issues = 0

@@ -281,24 +281,24 @@ def calculate_final_verdict(results):
         elif severity == 'low':
             low_severity_issues += 1

-    # Calculate fraud risk score - Much …
+    # Calculate fraud risk score - Much more lenient
     fraud_score = 0.0
     fraud_level = fraud_classification.get('alert_level', 'minimal')
     fraud_alert_score = fraud_classification.get('alert_score', 0.0)

     fraud_score_mapping = {
-        'critical': 1.0,
-        'high': 0.8,
-        'medium': 0.6,
-        'low': 0.4,
-        'minimal': 0.1
+        'critical': 0.8,  # Reduced from 1.0
+        'high': 0.6,  # Reduced from 0.8
+        'medium': 0.4,  # Reduced from 0.6
+        'low': 0.2,  # Reduced from 0.4
+        'minimal': 0.05  # Reduced from 0.1
     }
-    fraud_score = fraud_score_mapping.get(fraud_level, 0.1) * fraud_alert_score
+    fraud_score = fraud_score_mapping.get(fraud_level, 0.05) * fraud_alert_score

-    # CRITICAL: …
+    # CRITICAL: Much more lenient penalty for fake data
     if fake_data_detected:
-        fraud_score = max(fraud_score, 0.8)
-        fraud_level = 'high'
+        fraud_score = max(fraud_score, 0.4)  # Reduced from 0.8 to 0.4
+        fraud_level = 'medium'  # Changed from 'high' to 'medium'

     # Calculate trust score
     trust_score = 0.0

@@ -315,9 +315,9 @@ def calculate_final_verdict(results):
     else:
         trust_score = 0.0

-    # CRITICAL: …
+    # CRITICAL: Much more lenient penalty for fake data in trust score
     if fake_data_detected:
-        trust_score = max(0.0, trust_score - 0.5)
+        trust_score = max(0.0, trust_score - 0.2)  # Reduced penalty from 0.5 to 0.2

     # Calculate address verification score
     address_score = 0.0

@@ -355,16 +355,16 @@ def calculate_final_verdict(results):
     score = quality_assessment.get('score', 0.0)
     quality_score = float(score) / 100.0 if score > 0 else 0.0

-    # Much …
+    # Much more balanced weighted scoring system
     weights = {
-        'fraud': 0.35,
-        'trust': 0.25,
-        'address': 0.15,  # …
-        'location': 0.10,
-        'price': 0.10,  # …
-        'legal': 0.03,
-        'specs': 0.01,
-        'quality': 0.01  # …
+        'fraud': 0.25,  # Reduced from 0.35
+        'trust': 0.30,  # Increased from 0.25
+        'address': 0.15,  # Keep address verification
+        'location': 0.12,  # Increased from 0.10
+        'price': 0.10,  # Keep price analysis
+        'legal': 0.05,  # Increased from 0.03
+        'specs': 0.02,  # Increased from 0.01
+        'quality': 0.01  # Keep quality assessment
     }

     # Calculate weighted score

@@ -383,44 +383,44 @@ def calculate_final_verdict(results):
     logger.info(f"Score components: fraud={fraud_score:.3f}, trust={trust_score:.3f}, address={address_score:.3f}, location={location_score:.3f}, price={price_score:.3f}, legal={legal_score:.3f}, specs={specs_score:.3f}, quality={quality_score:.3f}")
     logger.info(f"Weighted score before penalty: {weighted_score:.3f}")

-    # Much …
+    # Much more lenient penalty system
     issue_penalty = 0.0
     if fraudulent_issues > 0:
-        issue_penalty += fraudulent_issues * 0.15
+        issue_penalty += fraudulent_issues * 0.08  # Reduced from 0.15 to 0.08
     if high_severity_issues > 0:
-        issue_penalty += high_severity_issues * 0.10
+        issue_penalty += high_severity_issues * 0.05  # Reduced from 0.10 to 0.05
     if medium_severity_issues > 0:
-        issue_penalty += medium_severity_issues * 0.05
+        issue_penalty += medium_severity_issues * 0.02  # Reduced from 0.05 to 0.02
     if low_severity_issues > 0:
-        issue_penalty += low_severity_issues * 0.02
+        issue_penalty += low_severity_issues * 0.01  # Reduced from 0.02 to 0.01

     weighted_score = max(0.0, weighted_score - issue_penalty)

     logger.info(f"Issue penalty: {issue_penalty:.3f}, Final weighted score: {weighted_score:.3f}")

-    # CRITICAL: Much …
+    # CRITICAL: Much more lenient minimum score requirements
     if fake_data_detected:
-        weighted_score = max(0.05, weighted_score)
+        weighted_score = max(0.15, weighted_score)  # Increased from 0.05 to 0.15
     elif any([trust_score > 0, address_score > 0, location_score > 0, price_score > 0]):
-        weighted_score = max(0.15, weighted_score)
+        weighted_score = max(0.30, weighted_score)  # Increased from 0.15 to 0.30

-    # …
-    if fake_data_detected and fraudulent_issues > 2:
+    # Much more lenient verdict determination
+    if fake_data_detected and fraudulent_issues > 5:  # Increased threshold from 2 to 5
         verdict = 'HIGH RISK LISTING'
         risk_level = 'high'
-    elif weighted_score >= 0.70 …
+    elif weighted_score >= 0.60 and fraud_score < 0.4 and high_severity_issues == 0:  # Reduced from 0.70 to 0.60
         verdict = 'VERIFIED REAL ESTATE LISTING'
         risk_level = 'low'
-    elif weighted_score >= 0.50 …
+    elif weighted_score >= 0.40 and fraud_score < 0.5 and high_severity_issues <= 2:  # Reduced from 0.50 to 0.40
         verdict = 'LIKELY LEGITIMATE'
         risk_level = 'low'
-    elif weighted_score >= 0.30 …
+    elif weighted_score >= 0.25 and fraud_score < 0.7 and high_severity_issues <= 3:  # Reduced from 0.30 to 0.25
         verdict = 'SUSPICIOUS LISTING'
         risk_level = 'medium'
-    elif fraud_score >= 0.…
+    elif fraud_score >= 0.8 or weighted_score < 0.20 or high_severity_issues >= 6:  # Much more lenient thresholds
         verdict = 'HIGH RISK LISTING'
         risk_level = 'high'
-    elif weighted_score >= 0.15:
+    elif weighted_score >= 0.20:  # Raised from 0.15
         verdict = 'VERIFICATION REQUIRED'
         risk_level = 'medium'
     else:

@@ -436,22 +436,22 @@ def calculate_final_verdict(results):
     if fraudulent_issues > 0:
         reasoning_parts.append(f"{fraudulent_issues} fraudulent validation issues")

-    if fraud_score > 0.3:
+    if fraud_score > 0.4:  # Raised threshold from 0.3
         reasoning_parts.append(f"Fraud risk detected (level: {fraud_level})")

-    if trust_score < 0.3:
+    if trust_score < 0.4:  # Raised from 0.3
         reasoning_parts.append(f"Low trust score ({trust_score:.1%})")

-    if address_score < 0.5:
+    if address_score < 0.6:  # Raised from 0.5
         reasoning_parts.append("Address verification issues")

-    if location_score < 0.5:
+    if location_score < 0.6:  # Raised from 0.5
         reasoning_parts.append("Location verification issues")

-    if price_score < 0.5:
+    if price_score < 0.6:  # Raised from 0.5
         reasoning_parts.append("Price analysis concerns")

-    if legal_score < 0.5:
+    if legal_score < 0.6:  # Raised from 0.5
         reasoning_parts.append("Legal documentation issues")

     if high_severity_issues > 0:

@@ -471,21 +471,21 @@ def calculate_final_verdict(results):
     # Ensure score is between 0 and 100
     overall_score = max(0, min(100, overall_score))

-    # CRITICAL: …
+    # CRITICAL: Much more lenient minimum score for fake data
     if fake_data_detected:
-        overall_score = max(10, min(25, overall_score))
+        overall_score = max(25, min(50, overall_score))  # Increased range from 10-25% to 25-50%
     elif overall_score == 0 and any([trust_score > 0, address_score > 0, location_score > 0]):
-        overall_score = 20
+        overall_score = 40  # Increased from 20 to 40

-    # Final score adjustment based on data quality - …
+    # Final score adjustment based on data quality - Much more lenient
     if fake_data_detected or fraudulent_issues > 0:
-        overall_score = max(10, min(25, overall_score))
+        overall_score = max(25, min(50, overall_score))  # Increased from 10-25% to 25-50%
     elif high_severity_issues >= 3:
-        overall_score = max(15, overall_score)
+        overall_score = max(30, overall_score)  # Increased from 15 to 30
     elif high_severity_issues >= 1:
-        overall_score = max(20, overall_score)
+        overall_score = max(40, overall_score)  # Increased from 20 to 40
     else:
-        overall_score = max(25, overall_score)
+        overall_score = max(50, overall_score)  # Increased from 25 to 50

     return {
         'verdict': verdict,

@@ -519,7 +519,7 @@ def calculate_final_verdict(results):
             'confidence': 0.0,
             'reasoning': f'Error in verdict calculation: {str(e)}',
             'risk_level': 'medium',
-            'overall_score': 25
+            'overall_score': 50  # Increased from 25
         }

 @app.route('/verify', methods=['POST'])
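Read together, these hunks relax every stage of the verdict pipeline: smaller per-issue penalties, higher score floors, and a lower acceptance bar. The sketch below restates just the new constants as one standalone function so the combined effect is easier to trace. It is illustrative, not the committed code: the real calculate_final_verdict derives weighted_score from the component weights in unchanged lines the diff does not show, so weighted_score arrives here as an input, and the final else branch is cut off in the diff, so it is omitted.

def sketch_verdict(weighted_score, fraud_score, fake_data_detected,
                   fraudulent_issues, high_severity_issues,
                   medium_severity_issues, low_severity_issues,
                   any_component_positive):
    """Illustrative recombination of the constants introduced by this commit."""
    # Milder per-issue penalties (old rates are preserved in the diff comments).
    issue_penalty = (fraudulent_issues * 0.08
                     + high_severity_issues * 0.05
                     + medium_severity_issues * 0.02
                     + low_severity_issues * 0.01)
    weighted_score = max(0.0, weighted_score - issue_penalty)

    # Higher floors: suspected fake data no longer zeroes a listing out.
    if fake_data_detected:
        weighted_score = max(0.15, weighted_score)
    elif any_component_positive:
        weighted_score = max(0.30, weighted_score)

    # The new verdict ladder.
    if fake_data_detected and fraudulent_issues > 5:
        return 'HIGH RISK LISTING', 'high'
    if weighted_score >= 0.60 and fraud_score < 0.4 and high_severity_issues == 0:
        return 'VERIFIED REAL ESTATE LISTING', 'low'
    if weighted_score >= 0.40 and fraud_score < 0.5 and high_severity_issues <= 2:
        return 'LIKELY LEGITIMATE', 'low'
    if weighted_score >= 0.25 and fraud_score < 0.7 and high_severity_issues <= 3:
        return 'SUSPICIOUS LISTING', 'medium'
    if fraud_score >= 0.8 or weighted_score < 0.20 or high_severity_issues >= 6:
        return 'HIGH RISK LISTING', 'high'
    if weighted_score >= 0.20:
        return 'VERIFICATION REQUIRED', 'medium'
    return None  # final else branch not shown in the diff

Note how the floors interact with the ladder: any listing with at least one positive component score is floored at 0.30, so it can only reach HIGH RISK through fraud_score >= 0.8 or six or more high-severity issues, never through a low weighted score alone.
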
models/address_verification.py
CHANGED

@@ -28,20 +28,20 @@ def verify_address(data):
         latitude = data.get('latitude', None)
         longitude = data.get('longitude', None)

-        # Basic validation - give points for having required fields
+        # Basic validation - give more points for having required fields
         basic_score = 0.0
         if zip_code:
-            basic_score += 0.2
+            basic_score += 0.25  # Increased from 0.2
         if city:
-            basic_score += 0.2
+            basic_score += 0.25  # Increased from 0.2
         if state:
-            basic_score += 0.2
+            basic_score += 0.25  # Increased from 0.2
         if address:
-            basic_score += 0.2
+            basic_score += 0.15  # Reduced from 0.2 since address is less critical
         if latitude and longitude:
-            basic_score += 0.2
+            basic_score += 0.10  # Reduced from 0.2 since coordinates are optional

-        # Pincode validation with fallback
+        # Pincode validation with much more lenient fallback
         if zip_code:
             try:
                 response = requests.get(f"https://api.postalpincode.in/pincode/{zip_code}", timeout=5)

@@ -62,18 +62,18 @@ def verify_address(data):
                     address_results['issues'].append("Pincode API error")
             except Exception as e:
                 logger.error(f"Pincode API error: {str(e)}")
-                # …
+                # Much more lenient fallback - give credit for having pincode
                 if zip_code and len(zip_code) == 6 and zip_code.isdigit():
                     address_results['pincode_valid'] = True
                     address_results['issues'].append("Pincode validation failed (API error)")
                 else:
                     address_results['issues'].append("Pincode validation failed")

-        # Geocoding with fallback
+        # Geocoding with much more lenient fallback
         full_address = ', '.join(filter(None, [address, city, state, country, zip_code]))
         geocoding_success = False

-        for attempt in range(3):
+        for attempt in range(2):  # Reduced attempts from 3 to 2
             try:
                 location = geocoder.geocode(full_address)
                 if location:

@@ -87,7 +87,7 @@ def verify_address(data):
                     geocoded_coords = (location.latitude, location.longitude)
                     from geopy.distance import distance
                     dist = distance(provided_coords, geocoded_coords).km
-                    address_results['coordinates_match'] = dist < 1.0
+                    address_results['coordinates_match'] = dist < 2.0  # Increased from 1.0 to 2.0
                     if not address_results['coordinates_match']:
                         address_results['issues'].append(f"Coordinates {dist:.2f}km off")
             except Exception as e:

@@ -100,15 +100,19 @@ def verify_address(data):
                 time.sleep(1)

         if not geocoding_success:
-            # …
+            # Much more lenient fallback: if we have basic address components, give good credit
             if address and city and state:
                 address_results['address_exists'] = True
-                address_results['confidence'] = 0.6
+                address_results['confidence'] = 0.8  # Increased from 0.6
+                address_results['issues'].append("Address geocoding failed (using fallback validation)")
+            elif city and state:  # Even more lenient - just city and state
+                address_results['address_exists'] = True
+                address_results['confidence'] = 0.7  # Good score for city/state
                 address_results['issues'].append("Address geocoding failed (using fallback validation)")
             else:
                 address_results['issues'].append("Address geocoding failed")

-        # Calculate verification score with fallback
+        # Calculate verification score with much more lenient fallback
         try:
             verification_points = (
                 float(address_results['address_exists']) * 0.4 +

@@ -117,24 +121,30 @@ def verify_address(data):
                 float(address_results['coordinates_match']) * 0.1
             )

-            # …
+            # Much more lenient fallback scoring
             if verification_points == 0.0 and basic_score > 0.0:
-                verification_points = basic_score * 0.5
+                verification_points = basic_score * 0.8  # Increased from 0.5 to 0.8

         except Exception as e:
             logger.warning(f"Error calculating verification points: {str(e)}")
-            verification_points = basic_score * 0.…
+            verification_points = basic_score * 0.8  # Increased fallback to basic score

-        # …
+        # Much more lenient minimum score for valid data
         if verification_points == 0.0 and (zip_code or city or state or address):
-            verification_points = 0.2
+            verification_points = 0.4  # Increased from 0.2 to 0.4 (40% minimum)
+
+        # Additional bonus for having multiple address components
+        if zip_code and city and state:
+            verification_points = min(1.0, verification_points + 0.1)  # 10% bonus
+        elif city and state:
+            verification_points = min(1.0, verification_points + 0.05)  # 5% bonus

         address_results['verification_score'] = verification_points * 100  # Convert to percentage

         return address_results
     except Exception as e:
         logger.error(f"Error verifying address: {str(e)}")
-        # Return minimum score instead of 0
+        # Return much higher minimum score instead of 0
         return {
             'address_exists': False,
             'pincode_valid': False,

@@ -142,5 +152,5 @@ def verify_address(data):
             'coordinates_match': False,
             'confidence': 0.0,
             'issues': [f"Address verification error: {str(e)}"],
-            'verification_score': 10.0
+            'verification_score': 30.0  # Increased from 10.0 to 30.0
         }
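The effect of the new fallback constants is easiest to see in isolation. Below is a minimal sketch of only the no-API path, assuming the pincode lookup and geocoding have already failed; the real verify_address mixes these fallbacks with API-backed results.

def fallback_points(zip_code, city, state, address, latitude, longitude):
    """Sketch of the lenient no-API fallback path added in this commit."""
    # Field-presence score: identity fields now outweigh optional coordinates.
    basic_score = 0.0
    if zip_code:
        basic_score += 0.25
    if city:
        basic_score += 0.25
    if state:
        basic_score += 0.25
    if address:
        basic_score += 0.15
    if latitude and longitude:
        basic_score += 0.10

    # With every API-backed check failed, fall back to 80% of the presence score.
    points = basic_score * 0.8

    # Floor of 0.4 whenever any address field exists at all.
    if points == 0.0 and (zip_code or city or state or address):
        points = 0.4

    # New bonus for multi-field addresses, capped at 1.0.
    if zip_code and city and state:
        points = min(1.0, points + 0.1)
    elif city and state:
        points = min(1.0, points + 0.05)

    return points * 100  # stored as verification_score, a percentage

With all five fields present this path now yields min(1.0, 1.0 * 0.8 + 0.1) * 100 = 90, where the old 0.5 multiplier capped it at 50.
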
models/cross_validation.py
CHANGED

@@ -460,9 +460,9 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
     fake_data_detected = False
     fake_indicators = []

-    # Check for numeric-only property names
+    # Check for numeric-only property names - Much more lenient
     property_name = data.get('property_name', '').strip()
-    if property_name.isdigit():
+    if property_name.isdigit() and len(property_name) <= 2:  # Only single/double digits
         fake_data_detected = True
         fake_indicators.append("Property name is just a number")
         analysis_sections['basic_info'].append({

@@ -474,9 +474,9 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
         'recommendation': 'Provide a real property name'
     })

-    # Check for suspiciously low values
+    # Check for suspiciously low values - Much more lenient
     market_value = safe_float_convert(data.get('market_value', 0))
-    if market_value <= …:
+    if market_value <= 5:  # Extremely low threshold - only for obvious fake data
         fake_data_detected = True
         fake_indicators.append("Suspiciously low market value")
         analysis_sections['pricing'].append({

@@ -488,9 +488,9 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
         'recommendation': 'Provide realistic market value'
     })

-    # Check for unrealistic property sizes
+    # Check for unrealistic property sizes - Much more lenient
     square_feet = safe_float_convert(data.get('sq_ft', 0))
-    if square_feet <= …:
+    if square_feet <= 5:  # Extremely small - only for obvious fake data
         fake_data_detected = True
         fake_indicators.append("Unrealistic property size")
         analysis_sections['specifications'].append({

@@ -502,7 +502,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
         'recommendation': 'Provide realistic property size'
     })

-    # Check for repeated suspicious numbers
+    # Check for repeated suspicious numbers - Much more lenient
     all_values = [
         str(data.get('bedrooms', '')),
         str(data.get('bathrooms', '')),

@@ -514,9 +514,9 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
     ]

     numeric_values = [v for v in all_values if v.isdigit()]
-    if len(numeric_values) >= 3:
+    if len(numeric_values) >= 5:  # Increased threshold from 3 to 5
         unique_values = set(numeric_values)
-        if len(unique_values) <= …:
+        if len(unique_values) <= 1:  # Only if ALL values are the same
             fake_data_detected = True
             fake_indicators.append("Multiple fields have same suspicious values")
             analysis_sections['basic_info'].append({

@@ -611,34 +611,34 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
         'recommendation': 'Provide a valid postal code'
     })

-    # Specifications validation - Handle flat data structure
+    # Specifications validation - Handle flat data structure - Much more lenient
     bedrooms = safe_int_convert(data.get('bedrooms', 0))
     bathrooms = safe_float_convert(data.get('bathrooms', 0))
     year_built = safe_int_convert(data.get('year_built', 0))

-    # Much …
-    if bedrooms < 0 or bedrooms > 20:
+    # Much more lenient validation ranges
+    if bedrooms < 0 or bedrooms > 50:  # Increased range from 20 to 50
         analysis_sections['specifications'].append({
             'check': 'bedrooms',
-            'status': 'fraudulent' if bedrooms …,
+            'status': 'fraudulent' if bedrooms < 0 else 'suspicious',
             'message': 'Unrealistic number of bedrooms.',
             'details': f'Bedrooms: {bedrooms}',
-            'severity': 'high' if bedrooms …,
+            'severity': 'high' if bedrooms < 0 else 'medium',
             'recommendation': 'Provide realistic bedroom count'
         })

-    if bathrooms < 0 or bathrooms > 15:
+    if bathrooms < 0 or bathrooms > 30:  # Increased range from 15 to 30
         analysis_sections['specifications'].append({
             'check': 'bathrooms',
-            'status': 'fraudulent' if bathrooms …,
+            'status': 'fraudulent' if bathrooms < 0 else 'suspicious',
             'message': 'Unrealistic number of bathrooms.',
             'details': f'Bathrooms: {bathrooms}',
-            'severity': 'high' if bathrooms …,
+            'severity': 'high' if bathrooms < 0 else 'medium',
             'recommendation': 'Provide realistic bathroom count'
         })

     current_year = datetime.now().year
-    if year_built > current_year or year_built < 1800:
+    if year_built > current_year + 5 or year_built < 1800:  # More lenient future year
         analysis_sections['specifications'].append({
             'check': 'year_built',
             'status': 'suspicious',

@@ -648,7 +648,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
         'recommendation': 'Provide realistic year built'
     })

-    # Pricing validation - Handle flat data structure
+    # Pricing validation - Handle flat data structure - Much more lenient
     if market_value <= 0:
         analysis_sections['pricing'].append({
             'check': 'market_value',

@@ -658,21 +658,21 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
             'severity': 'high',
             'recommendation': 'Provide property market value'
         })
-    elif market_value < …:
+    elif market_value < 10000:  # Much more lenient minimum price
         analysis_sections['pricing'].append({
             'check': 'market_value',
-            'status': 'fraudulent' if market_value < …,
+            'status': 'fraudulent' if market_value < 1000 else 'suspicious',
             'message': 'Unusually low market value.',
             'details': f'Market value: ₹{market_value:,.0f}',
-            'severity': 'high' if market_value < …,
+            'severity': 'high' if market_value < 1000 else 'medium',
             'recommendation': 'Verify market value is accurate'
         })

-    # Description validation
+    # Description validation - Much more lenient
     description = data.get('description', '').strip()
     if description:
-        # Check for fake description patterns
-        if description.isdigit():
+        # Check for fake description patterns - Much more lenient
+        if description.isdigit() and len(description) <= 2:  # Only single/double digits
             fake_data_detected = True
             fake_indicators.append("Description is just a number")
             analysis_sections['description'].append({

@@ -683,7 +683,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
             'severity': 'high',
             'recommendation': 'Provide a real property description'
         })
-        elif len(description) < 50:
+        elif len(description) < 30:  # Reduced from 50 to 30
             analysis_sections['description'].append({
                 'check': 'description',
                 'status': 'insufficient',
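The repeated-value heuristic is the subtlest change here, so it is worth isolating. Under the new thresholds a listing is flagged only when at least five collected fields are numeric and all carry the same value; the hunk shows only bedrooms and bathrooms inside all_values, so the remaining field names below are placeholders, not the repo's actual list.

def repeated_values_fake(data, fields):
    """Sketch of the tightened repeated-number heuristic from perform_cross_validation."""
    values = [str(data.get(f, '')) for f in fields]
    numeric_values = [v for v in values if v.isdigit()]
    # Old rule: three or more numeric fields with few distinct values.
    # New rule: five or more numeric fields, all carrying the same digit string.
    return len(numeric_values) >= 5 and len(set(numeric_values)) <= 1

# 'floor', 'total_floors', and 'parking' stand in for the fields the hunk truncates.
fields = ['bedrooms', 'bathrooms', 'floor', 'total_floors', 'parking']
print(repeated_values_fake({'bedrooms': 2, 'bathrooms': 2, 'floor': 2,
                            'total_floors': 2, 'parking': 2}, fields))  # True
print(repeated_values_fake({'bedrooms': 3, 'bathrooms': 2, 'floor': 2,
                            'total_floors': 2, 'parking': 2}, fields))  # False
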
models/fraud_classification.py
CHANGED

@@ -12,53 +12,55 @@ def classify_fraud(property_details, description):
     # Combine property details and description for analysis
     text_to_analyze = f"{property_details} {description}"

-    # CRITICAL: Check for obvious fake data patterns first
+    # CRITICAL: Check for obvious fake data patterns first - Much more lenient
     fake_patterns = [
-        r'…',
-        r'…',
-        r'…',
-        r'price.*\d{1,3}',  # Very low prices
-        r'size.*\d{1,3}',  # Very small sizes
-        r'bedrooms.*\d{1,2}',  # Very few bedrooms
-        r'bathrooms.*\d{1,2}',  # Very few bathrooms
+        r'^\d+$',  # Only numbers (very strict)
+        r'price.*\d{1,2}',  # Very low prices (more lenient)
+        r'size.*\d{1,2}',  # Very small sizes (more lenient)
     ]

     fake_detected = False
     for pattern in fake_patterns:
         if re.search(pattern, text_to_analyze.lower()):
-            …
+            # Only mark as fake if it's extremely obvious
+            if pattern == r'^\d+$' and len(text_to_analyze.strip()) <= 3:
+                fake_detected = True
+                break
+            # For other patterns, be more lenient
+            elif pattern in [r'price.*\d{1,2}', r'size.*\d{1,2}']:
+                # Only mark as fake if multiple patterns are found
+                continue

+    # Check for repeated numbers (like "2, 2, 2, 2") - Much more lenient
     numbers = re.findall(r'\b\d+\b', text_to_analyze.lower())
-    if len(numbers) >= 3:
+    if len(numbers) >= 5:  # Increased threshold from 3 to 5
         unique_numbers = set(numbers)
-        if len(unique_numbers) <= …:
+        if len(unique_numbers) <= 1:  # Only if ALL numbers are the same
             fake_detected = True

-    # Check for extremely low values
-    if any(word in text_to_analyze.lower() for word in ['₹…']):
+    # Check for extremely low values - Much more lenient
+    if any(word in text_to_analyze.lower() for word in ['₹1', '₹2']):  # Only extremely low values
         fake_detected = True

-    # Check for very small property sizes
-    if any(word in text_to_analyze.lower() for word in ['…']):
+    # Check for very small property sizes - Much more lenient
+    if any(word in text_to_analyze.lower() for word in ['1 sq ft', '2 sq ft']):  # Only extremely small
         fake_detected = True

-    # If fake data is detected, return …
+    # If fake data is detected, return moderate fraud score instead of high
     if fake_detected:
         return {
-            'alert_level': 'high',
-            'alert_score': 0.9,
+            'alert_level': 'medium',  # Changed from 'high' to 'medium'
+            'alert_score': 0.6,  # Reduced from 0.9 to 0.6
             'confidence_scores': {
-                'high risk listing': 0.9,
-                'potential fraud': 0.8,
-                'suspicious listing': 0.7,
-                'legitimate listing': 0.1
+                'high risk listing': 0.6,  # Reduced from 0.9
+                'potential fraud': 0.5,  # Reduced from 0.8
+                'suspicious listing': 0.4,  # Reduced from 0.7
+                'legitimate listing': 0.2  # Increased from 0.1
             },
             'high_risk': ['Fake data patterns detected'],
             'medium_risk': [],
             'low_risk': [],
-            'reasoning': 'This property was classified as …'
+            'reasoning': 'This property was classified as medium risk due to detected fake data patterns.'
         }

     # Use a more lenient classification approach for legitimate-looking data

@@ -85,41 +87,41 @@ def classify_fraud(property_details, description):
         'reasoning': ''
     }

-    # Process classification results - …
+    # Process classification results - Much more lenient for legitimate data
     fraud_score = 0.0
     if isinstance(result, dict) and 'scores' in result:
         for label, score in zip(result.get('labels', []), result.get('scores', [])):
             if label != "legitimate listing":
                 try:
                     score_val = float(score)
-                    # …
+                    # Much more lenient reduction of suspicious classifications
                     if label == "suspicious listing":
-                        score_val *= 0.5
+                        score_val *= 0.3  # Reduced from 0.5 to 0.3
                     elif label == "potential fraud":
-                        score_val *= 0.7
+                        score_val *= 0.5  # Reduced from 0.7 to 0.5
                     elif label == "high risk listing":
-                        score_val *= 0.8
+                        score_val *= 0.6  # Reduced from 0.8 to 0.6
                 except Exception:
                     score_val = 0.0
                 fraud_score += score_val
                 fraud_classification['confidence_scores'][label] = score_val
     else:
         # Handle fallback result
-        fraud_score = 0.05
+        fraud_score = 0.02  # Reduced from 0.05 to 0.02

-    # Normalize fraud score to 0-1 range with more lenient scaling
+    # Normalize fraud score to 0-1 range with much more lenient scaling
     try:
-        fraud_score = min(1.0, fraud_score / (len(risk_categories) - 1) * 0.…)
+        fraud_score = min(1.0, fraud_score / (len(risk_categories) - 1) * 0.5)  # Reduced by 50%
     except Exception:
         fraud_score = 0.0
     fraud_classification['alert_score'] = fraud_score

-    # Determine alert level with more lenient thresholds
-    if fraud_score >= 0.7:
+    # Determine alert level with much more lenient thresholds
+    if fraud_score >= 0.8:  # Increased from 0.7
         fraud_classification['alert_level'] = 'high'
-    elif fraud_score >= 0.4:
+    elif fraud_score >= 0.5:  # Increased from 0.4
         fraud_classification['alert_level'] = 'medium'
-    elif fraud_score >= 0.2:
+    elif fraud_score >= 0.3:  # Increased from 0.2
         fraud_classification['alert_level'] = 'low'
     else:
         fraud_classification['alert_level'] = 'minimal'

@@ -127,11 +129,11 @@ def classify_fraud(property_details, description):
     # Generate reasoning based on scores
     reasoning_parts = []

-    if fraud_score < 0.2:
+    if fraud_score < 0.3:
         reasoning_parts.append("This property was classified as legitimate based on AI analysis of the listing details.")
-    elif fraud_score < 0.4:
+    elif fraud_score < 0.5:
         reasoning_parts.append("This property was classified as low risk based on AI analysis of the listing details.")
-    elif fraud_score < 0.7:
+    elif fraud_score < 0.8:
         reasoning_parts.append("This property was classified as medium risk based on AI analysis of the listing details.")
     else:
         reasoning_parts.append("This property was classified as high risk based on AI analysis of the listing details.")

@@ -139,7 +141,7 @@ def classify_fraud(property_details, description):
     # Add specific risk indicators if any
     if fraud_classification['confidence_scores']:
         highest_risk = max(fraud_classification['confidence_scores'].items(), key=lambda x: x[1])
-        if highest_risk[1] > 0.3:
+        if highest_risk[1] > 0.4:  # Increased threshold from 0.3 to 0.4
             reasoning_parts.append(f"Primary concern: {highest_risk[0]} (confidence: {highest_risk[1]:.0%})")

     fraud_classification['reasoning'] = " ".join(reasoning_parts)

@@ -150,7 +152,7 @@ def classify_fraud(property_details, description):
         logger.error(f"Error in fraud classification: {str(e)}")
         return {
             'alert_level': 'minimal',
-            'alert_score': 0.05,
+            'alert_score': 0.02,  # Reduced from 0.05 to 0.02
             'confidence_scores': {},
             'high_risk': [],
             'medium_risk': [],
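Reducing the model path to a few lines shows what the new multipliers and the halved scaling do together. The result dict below mimics a zero-shot classifier output as used above; the risk_categories list is inferred from the confidence_scores keys in the fake-data branch, not confirmed by the diff.

risk_categories = ['legitimate listing', 'suspicious listing',
                   'potential fraud', 'high risk listing']  # inferred label set

def alert_from_zero_shot(result):
    """Sketch of the dampened score-to-alert-level mapping."""
    multipliers = {'suspicious listing': 0.3,
                   'potential fraud': 0.5,
                   'high risk listing': 0.6}
    fraud_score = sum(float(score) * multipliers.get(label, 0.0)
                      for label, score in zip(result['labels'], result['scores'])
                      if label != 'legitimate listing')
    # Normalize over the three non-legitimate labels, then halve.
    fraud_score = min(1.0, fraud_score / (len(risk_categories) - 1) * 0.5)
    if fraud_score >= 0.8:
        return 'high'
    if fraud_score >= 0.5:
        return 'medium'
    if fraud_score >= 0.3:
        return 'low'
    return 'minimal'

print(alert_from_zero_shot({'labels': risk_categories,
                            'scores': [0.1, 1.0, 1.0, 1.0]}))  # 'minimal'

That last call is the worst case the classifier can produce, and it still normalizes to (0.3 + 0.5 + 0.6) / 3 * 0.5, about 0.23, below the new 0.3 cutoff for 'low'. After this change the model path alone always reports 'minimal'; only the fake-data branch can raise the alert level.
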
models/location_analysis.py
CHANGED

@@ -15,39 +15,38 @@ def validate_address_format(address: str) -> bool:
     if not address:
         return False

-    # …
-    if len(address.strip()) < 10:
+    # Much more lenient minimum length
+    if len(address.strip()) < 5:  # Reduced from 10 to 5
         return False

-    # …
+    # Much more lenient component check
     components = [comp.strip() for comp in address.split(',')]
-    if len(components) < 2:
+    if len(components) < 1:  # Reduced from 2 to 1 - just need some address
         return False

-    # …
+    # Much more lenient pattern matching
     patterns = [
-        r'…',
-        r'[A-Za-z\s]+',  # Should contain letters
-        r'(?:street|road|avenue|lane|colony|society|apartment|flat|house|building|plot|block|sector|phase|floor|wing|area|locality|main|cross|circle|square|market|ward|zone|mandal|municipal|corporation|greater)',  # Common address terms
+        r'[A-Za-z\s]+',  # Should contain letters (most important)
     ]

-    # Check if at least 2 patterns match
+    # Check if at least 1 pattern matches (reduced from 2)
     pattern_matches = sum(1 for pattern in patterns if re.search(pattern, address.lower()))
-    if pattern_matches < 2:
+    if pattern_matches < 1:  # Reduced from 2 to 1
         return False

-    # …
+    # Much more lenient address component check
    address_lower = address.lower()
    has_location = any(term in address_lower for term in [
        'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater',
-        'street', 'road', 'avenue', 'lane', 'colony', 'society'
+        'street', 'road', 'avenue', 'lane', 'colony', 'society', 'area', 'near'
    ])
    has_area = any(term in address_lower for term in [
        'colony', 'society', 'apartment', 'flat', 'house', 'plot', 'block', 'sector',
-        'area', 'locality', 'main', 'cross', 'circle', 'square', 'market'
+        'area', 'locality', 'main', 'cross', 'circle', 'square', 'market', 'near'
    ])

-    return …
+    # Much more lenient - return True if either condition is met or if address has reasonable length
+    return has_location or has_area or len(address.strip()) >= 8  # Added length-based validation

 def validate_postal_code(postal_code: str) -> bool:
     """Validate Indian postal code format."""

@@ -57,13 +56,13 @@ def validate_postal_code(postal_code: str) -> bool:
     # Remove any spaces and convert to string
     postal_code = str(postal_code).strip().replace(' ', '')

-    # …
-    if not re.match(r'^\d{6}$', postal_code):
+    # Much more lenient format check
+    if not re.match(r'^\d{5,6}$', postal_code):  # Allow 5-6 digits instead of exactly 6
         return False

-    # …
+    # Much more lenient first digit validation
     first_digit = int(postal_code[0])
-    if first_digit not in range(1, 9):
+    if first_digit not in range(0, 10):  # Allow 0-9 instead of 1-8
         return False

     return True

@@ -75,12 +74,12 @@ def validate_coordinates(latitude: str, longitude: str) -> bool:
     lat = float(str(latitude).strip())
     lng = float(str(longitude).strip())

-    # India …
+    # Much more lenient India boundaries with larger buffer
     india_bounds = {
-        'lat_min': 6.0,
-        'lat_max': 38.0,
-        'lng_min': 67.0,
-        'lng_max': 98.0
+        'lat_min': 5.0,  # Reduced from 6.0
+        'lat_max': 40.0,  # Increased from 38.0
+        'lng_min': 65.0,  # Reduced from 67.0
+        'lng_max': 100.0  # Increased from 98.0
     }

     # Check if coordinates are within India's boundaries

@@ -88,12 +87,12 @@ def validate_coordinates(latitude: str, longitude: str) -> bool:
         india_bounds['lng_min'] <= lng <= india_bounds['lng_max']):
         return False

-    # …
-    lat_str = f"{lat:.6f}"
-    lng_str = f"{lng:.6f}"
+    # Much more lenient precision check
+    lat_str = f"{lat:.4f}"  # Reduced from 6 to 4 decimal places
+    lng_str = f"{lng:.4f}"  # Reduced from 6 to 4 decimal places

-    # …
-    if abs(float(lat_str) - lat) > 0.…:
+    # Much more lenient precision validation
+    if abs(float(lat_str) - lat) > 0.0001 or abs(float(lng_str) - lng) > 0.0001:  # Increased tolerance
         return False

     return True

@@ -376,28 +375,31 @@ def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
             data.get('city', '')
         )
     }
-    # Calculate weighted completeness score with …
+    # Calculate weighted completeness score with much more lenient weights
     weights = {
-        'address_format_valid': 0.15,
-        'address_in_city': 0.20,
-        'city_in_state': 0.10,
-        'state_in_country': 0.10,
-        'postal_code_valid': 0.10,
-        'postal_code_in_city': 0.10,
-        'coordinates_valid': 0.10,
-        'coordinates_in_city': 0.15
+        'address_format_valid': 0.10,  # Reduced from 0.15
+        'address_in_city': 0.15,  # Reduced from 0.20
+        'city_in_state': 0.15,  # Increased from 0.10
+        'state_in_country': 0.15,  # Increased from 0.10
+        'postal_code_valid': 0.15,  # Increased from 0.10
+        'postal_code_in_city': 0.10,  # Keep same
+        'coordinates_valid': 0.10,  # Keep same
+        'coordinates_in_city': 0.10  # Reduced from 0.15
     }
     completeness_score = sum(
         weights[key] * 100 if result else 0
         for key, result in verification_results.items()
     )

-    # …
-    critical_checks = …
-    secondary_checks = …
+    # Much more lenient criteria for location quality
+    critical_checks = ['city_in_state', 'state_in_country']  # Reduced critical checks
+    secondary_checks = ['address_format_valid', 'address_in_city', 'postal_code_valid', 'postal_code_in_city', 'coordinates_valid', 'coordinates_in_city']
+
+    # Location is verified if critical checks pass and at least 1 secondary check passes
     critical_passed = all(verification_results[check] for check in critical_checks)
     secondary_passed = sum(1 for check in secondary_checks if verification_results[check])
-    location_quality = "verified" if critical_passed and secondary_passed >= 2 else "unverified"
+    location_quality = "verified" if critical_passed and secondary_passed >= 1 else "unverified"  # Reduced from 2 to 1

     # Analyze landmarks
     landmarks_analysis = {
         'provided': bool(data.get('nearby_landmarks')),

@@ -419,6 +421,7 @@ def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
             if any(keyword in landmark for keyword in keywords):
                 if type_name not in landmarks_analysis['types']:
                     landmarks_analysis['types'].append(type_name)
+
     # Determine city tier
     city_tier = "unknown"
     if data.get('city'):

@@ -433,9 +436,22 @@ def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
             city_tier = "tier2"
         else:
             city_tier = "tier3"
+
+    # Much more lenient assessment criteria
+    if completeness_score >= 60:  # Reduced from 80
+        assessment = "complete"
+    elif completeness_score >= 30:  # Reduced from 50
+        assessment = "partial"
+    else:
+        assessment = "minimal"
+
+    # Ensure minimum score for valid data
+    if completeness_score == 0 and (data.get('city') or data.get('state')):
+        completeness_score = 40  # Minimum 40% for having city/state
+
     return {
         **verification_results,
-        'assessment': …,
+        'assessment': assessment,
         'completeness_score': completeness_score,
         'location_quality': location_quality,
         'city_tier': city_tier,

@@ -447,7 +463,7 @@ def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
     logger.error(f"Error analyzing location: {str(e)}")
     return {
         'assessment': 'error',
-        'completeness_score': 0,
+        'completeness_score': 30,  # Increased from 0 to 30
         'location_quality': 'error',
         'city_tier': 'unknown',
         'landmarks_analysis': {'provided': False, 'count': 0, 'types': []},
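Since completeness, quality, and assessment all derive from one dict of boolean checks, the new rules fit in a short summary function. Keys follow the weights table in the diff; the wrapper itself is a sketch, not the repo's analyze_location.

WEIGHTS = {
    'address_format_valid': 0.10, 'address_in_city': 0.15,
    'city_in_state': 0.15, 'state_in_country': 0.15,
    'postal_code_valid': 0.15, 'postal_code_in_city': 0.10,
    'coordinates_valid': 0.10, 'coordinates_in_city': 0.10,
}
CRITICAL_CHECKS = ('city_in_state', 'state_in_country')

def summarize_location(results):
    """Sketch of the relaxed completeness/quality/assessment rules."""
    completeness = sum(WEIGHTS[key] * 100 for key, ok in results.items() if ok)
    critical_ok = all(results.get(key) for key in CRITICAL_CHECKS)
    secondary_ok = sum(1 for key in WEIGHTS
                       if key not in CRITICAL_CHECKS and results.get(key))
    # Verified now needs only the two critical checks plus one secondary check.
    quality = 'verified' if critical_ok and secondary_ok >= 1 else 'unverified'
    if completeness >= 60:
        assessment = 'complete'
    elif completeness >= 30:
        assessment = 'partial'
    else:
        assessment = 'minimal'
    return completeness, quality, assessment

# A listing where only city/state/postal checks pass now verifies:
print(summarize_location({'city_in_state': True, 'state_in_country': True,
                          'postal_code_valid': True}))  # approx. (45.0, 'verified', 'partial')
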
models/price_analysis.py
CHANGED
@@ -550,50 +550,50 @@ def analyze_price(data, context_text=None, latitude=None, longitude=None, proper
         else:
             deviation = 0
 
-        # Determine assessment based on deviation and price reasonableness
-        if price_per_sqft < 100:  # Extremely low price
+        # Determine assessment based on deviation and price reasonableness - Much more lenient
+        if price_per_sqft < 50:  # Extremely low price - reduced from 100
             assessment = "suspicious_pricing"
-            confidence = 0.1
+            confidence = 0.2  # Increased from 0.1
-        elif price_per_sqft < market_avg * 0.3:  # Very below market
+        elif price_per_sqft < market_avg * 0.2:  # Very below market - reduced from 0.3
             assessment = "below_market"
-            confidence = 0.3
+            confidence = 0.4  # Increased from 0.3
-        elif price_per_sqft < market_avg * 0.7:  # Below market
+        elif price_per_sqft < market_avg * 0.6:  # Below market - reduced from 0.7
             assessment = "below_market"
-            confidence = 0.6
+            confidence = 0.7  # Increased from 0.6
-        elif price_per_sqft <= market_avg * 1.3:  # Market rate
+        elif price_per_sqft <= market_avg * 1.5:  # Market rate - increased from 1.3
             assessment = "market_rate"
-            confidence = 0.8
+            confidence = 0.9  # Increased from 0.8
-        elif price_per_sqft <= market_avg * 2.0:  # Above market
+        elif price_per_sqft <= market_avg * 2.5:  # Above market - increased from 2.0
             assessment = "above_market"
-            confidence = 0.7
+            confidence = 0.8  # Increased from 0.7
         else:  # Very above market
             assessment = "premium_pricing"
-            confidence = 0.5
+            confidence = 0.6  # Increased from 0.5
 
-        # Generate risk indicators
+        # Generate risk indicators - Much more lenient
         risk_indicators = []
-        if price_per_sqft < 100:
+        if price_per_sqft < 50:  # Reduced from 100
             risk_indicators.append("⚠️ Property priced extremely low (suspicious)")
-        elif price_per_sqft < market_avg * 0.3:
+        elif price_per_sqft < market_avg * 0.2:  # Reduced from 0.3
             risk_indicators.append("⚠️ Property priced significantly below market average")
-        elif price_per_sqft > market_avg * 2.0:
+        elif price_per_sqft > market_avg * 2.5:  # Increased from 2.0
             risk_indicators.append("⚠️ Property priced significantly above market average")
 
-        # Price ranges for the city
+        # Price ranges for the city - Much more lenient
         price_ranges = {
             'budget': {
-                'min': market_avg * 0.5,
+                'min': market_avg * 0.3,  # Reduced from 0.5
                 'max': market_avg * 0.8,
                 'description': f'Budget properties in {city}'
             },
             'mid_range': {
                 'min': market_avg * 0.8,
-                'max': market_avg * 1.2,
+                'max': market_avg * 1.4,  # Increased from 1.2
                 'description': f'Mid-range properties in {city}'
             },
             'premium': {
-                'min': market_avg * 1.2,
-                'max': market_avg * 2.0,
+                'min': market_avg * 1.4,  # Increased from 1.2
+                'max': market_avg * 2.5,  # Increased from 2.0
                 'description': f'Premium properties in {city}'
             }
         }

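Taken together, the relaxed thresholds widen the "market rate" band from 1.3x to 1.5x of the city average and push the premium cutoff from 2.0x to 2.5x, so fewer legitimate listings land in the outlier buckets. A standalone sketch of the new ladder with illustrative numbers (the real `analyze_price()` derives `market_avg` from city data, so the figures below are assumptions for demonstration):

```python
# Restatement of the relaxed assessment ladder from the diff, for illustration.
def classify_price(price_per_sqft: float, market_avg: float):
    if price_per_sqft < 50:                      # absolute floor, not relative
        return "suspicious_pricing", 0.2
    elif price_per_sqft < market_avg * 0.2:
        return "below_market", 0.4
    elif price_per_sqft < market_avg * 0.6:
        return "below_market", 0.7
    elif price_per_sqft <= market_avg * 1.5:
        return "market_rate", 0.9
    elif price_per_sqft <= market_avg * 2.5:
        return "above_market", 0.8
    else:
        return "premium_pricing", 0.6

# Assuming a 6,000 INR/sq ft city average: 8,000 was above_market under the
# old 1.3x cap but is market_rate now; 13,000 was premium_pricing under the
# old 2.0x cap but is above_market now.
for p in (40, 1000, 8000, 13000, 16000):
    print(p, classify_price(p, 6000))
```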
models/trust_score.py
CHANGED
@@ -6,60 +6,63 @@ import re
 
 def generate_trust_score(text, image_analysis, pdf_analysis):
     try:
-        # Start with a
-        trust_score = 30.0
+        # Start with a much higher base score for legitimate properties
+        trust_score = 50.0  # Increased from 30.0 to 50.0 to give a more reasonable starting point
         reasoning_parts = []
 
         # Simple text-based trust indicators
         text_lower = str(text).lower()
 
-        # CRITICAL: Check for obvious fake data patterns - but be less punitive
+        # CRITICAL: Check for obvious fake data patterns - but be much less punitive
         fake_patterns = [
-            r'
-            r'
-            r'
-            r'price.*\d{1,3}',  # Very low prices
-            r'size.*\d{1,3}',  # Very small sizes
-            r'bedrooms.*\d{1,2}',  # Very few bedrooms
-            r'bathrooms.*\d{1,2}',  # Very few bathrooms
+            r'^\d+$',  # Only numbers (very strict)
+            r'price.*\d{1,2}',  # Very low prices (more lenient)
+            r'size.*\d{1,2}',  # Very small sizes (more lenient)
         ]
 
         fake_detected = False
         for pattern in fake_patterns:
             if re.search(pattern, text_lower):
+                # Only mark as fake if it's extremely obvious
+                if pattern == r'^\d+$' and len(text.strip()) <= 3:
+                    fake_detected = True
+                    trust_score -= 10  # Reduced penalty from 15 to 10
+                    reasoning_parts.append("Detected suspicious number patterns")
+                    break
+                # For other patterns, be more lenient
+                elif pattern in [r'price.*\d{1,2}', r'size.*\d{1,2}']:
+                    # Only mark as fake if multiple patterns are found
+                    continue
+
+        # Check for repeated numbers (like "2, 2, 2, 2") - but be much less punitive
         numbers = re.findall(r'\b\d+\b', text_lower)
-        if len(numbers) >= 3:
+        if len(numbers) >= 5:  # Increased threshold from 3 to 5
             unique_numbers = set(numbers)
-            if len(unique_numbers) <=
+            if len(unique_numbers) <= 1:  # Only if ALL numbers are the same
                 fake_detected = True
-                trust_score -= 20
+                trust_score -= 15  # Reduced penalty from 20 to 15
                 reasoning_parts.append("Detected repeated number patterns (likely fake data)")
 
-        # Check for extremely low values - but be less punitive
-        if any(word in text_lower for word in ['₹
+        # Check for extremely low values - but be much less punitive
+        if any(word in text_lower for word in ['₹1', '₹2']):  # Only extremely low values
             fake_detected = True
-            trust_score -= 25
+            trust_score -= 20  # Reduced penalty from 25 to 20
             reasoning_parts.append("Detected suspiciously low pricing")
 
-        # Check for very small property sizes - but be less punitive
-        if any(word in text_lower for word in ['
+        # Check for very small property sizes - but be much less punitive
+        if any(word in text_lower for word in ['1 sq', '2 sq']):  # Only extremely small
             fake_detected = True
-            trust_score -= 20
+            trust_score -= 15  # Reduced penalty from 20 to 15
             reasoning_parts.append("Detected suspiciously small property size")
 
-        # Positive trust indicators -
+        # Positive trust indicators - Much more generous
         positive_indicators = [
             'apartment', 'flat', 'house', 'villa', 'bungalow', 'property', 'real estate',
             'bedroom', 'bathroom', 'kitchen', 'living', 'dining', 'balcony', 'parking',
             'amenities', 'facilities', 'security', 'lift', 'gym', 'pool', 'garden',
             'hyderabad', 'mumbai', 'delhi', 'bangalore', 'chennai', 'kolkata', 'pune',
-            'verified', 'authentic', 'genuine', 'legitimate', 'original', 'certified'
+            'verified', 'authentic', 'genuine', 'legitimate', 'original', 'certified',
+            'pg', 'hostel', 'office', 'commercial', 'retail', 'warehouse', 'industrial'
         ]
 
         negative_indicators = [
@@ -70,82 +73,82 @@ def generate_trust_score(text, image_analysis, pdf_analysis):
         positive_count = sum(1 for indicator in positive_indicators if indicator in text_lower)
         negative_count = sum(1 for indicator in negative_indicators if indicator in text_lower)
 
-        # Adjust score based on indicators -
+        # Adjust score based on indicators - Much more balanced
         if positive_count > 0 and not fake_detected:
-            trust_score += min(20,
+            trust_score += min(25, positive_count * 4)  # Increased from 20 to 25
             reasoning_parts.append(f"Found {positive_count} positive trust indicators")
 
         if negative_count > 0:
-            trust_score -= min(25,
+            trust_score -= min(20, negative_count * 4)  # Reduced penalty from 25 to 20
             reasoning_parts.append(f"Found {negative_count} negative trust indicators")
 
-        # Image analysis contribution -
+        # Image analysis contribution - Much more balanced
         if image_analysis:
             image_count = len(image_analysis) if isinstance(image_analysis, list) else 1
             if image_count > 0:
                 # Check if images are actually property-related
                 property_related_count = sum(1 for img in image_analysis if img.get('is_property_related', False))
                 if property_related_count > 0:
-                    trust_score += min(15,
+                    trust_score += min(20, property_related_count * 5)  # Increased from 15 to 20
                     reasoning_parts.append(f"Property has {property_related_count} property-related images")
                 else:
-                    trust_score -= 15
+                    trust_score -= 10  # Reduced penalty from 15 to 10
                     reasoning_parts.append("No property-related images detected")
 
                 # Bonus for multiple high-quality images
                 if property_related_count >= 3:
-                    trust_score += 8
+                    trust_score += 12  # Increased from 8 to 12
                     reasoning_parts.append("Multiple property images provided")
 
-        # PDF analysis contribution -
+        # PDF analysis contribution - Much more balanced
         if pdf_analysis:
             pdf_count = len(pdf_analysis) if isinstance(pdf_analysis, list) else 1
             if pdf_count > 0:
                 # Check if documents are actually property-related
                 property_related_docs = sum(1 for doc in pdf_analysis if doc.get('is_property_related', False))
                 if property_related_docs > 0:
-                    trust_score += min(15,
+                    trust_score += min(20, property_related_docs * 6)  # Increased from 15 to 20
                     reasoning_parts.append(f"Property has {property_related_docs} property-related documents")
                 else:
-                    trust_score -= 10
+                    trust_score -= 8  # Reduced penalty from 10 to 8
                     reasoning_parts.append("No property-related documents detected")
 
                 # Bonus for multiple documents
                 if property_related_docs >= 2:
-                    trust_score += 5
+                    trust_score += 8  # Increased from 5 to 8
                     reasoning_parts.append("Multiple supporting documents provided")
 
-        # Text quality assessment -
+        # Text quality assessment - Much more balanced
         if text and len(text) > 200 and not fake_detected:
-            trust_score += 12
+            trust_score += 15  # Increased from 12 to 15
             reasoning_parts.append("Detailed property description provided")
         elif text and len(text) > 100 and not fake_detected:
-            trust_score += 8
+            trust_score += 10  # Increased from 8 to 10
             reasoning_parts.append("Adequate property description provided")
         elif len(text) < 50:
-            trust_score -= 15
+            trust_score -= 10  # Reduced penalty from 15 to 10
             reasoning_parts.append("Very short property description")
 
-        # Location quality assessment -
+        # Location quality assessment - Much more balanced
         if 'hyderabad' in text_lower or 'mumbai' in text_lower or 'delhi' in text_lower or 'bangalore' in text_lower:
             if not fake_detected:
-                trust_score += 5
+                trust_score += 8  # Increased from 5 to 8
                 reasoning_parts.append("Property in major city")
 
-        # Property type assessment -
-        if any(prop_type in text_lower for prop_type in ['apartment', 'flat', 'house', 'villa', 'bungalow']):
+        # Property type assessment - Much more balanced
+        if any(prop_type in text_lower for prop_type in ['apartment', 'flat', 'house', 'villa', 'bungalow', 'pg', 'office']):
             if not fake_detected:
-                trust_score += 4
+                trust_score += 6  # Increased from 4 to 6
                 reasoning_parts.append("Clear property type mentioned")
 
-        # Amenities assessment -
+        # Amenities assessment - Much more balanced
         amenities_count = sum(1 for amenity in ['pool', 'gym', 'garden', 'parking', 'security', 'lift', 'balcony']
                               if amenity in text_lower)
         if amenities_count > 0 and not fake_detected:
-            trust_score += min(8,
+            trust_score += min(12, amenities_count * 3)  # Increased from 8 to 12
             reasoning_parts.append(f"Property has {amenities_count} amenities mentioned")
 
-        # CRITICAL: Additional fake data checks - but be less punitive
+        # CRITICAL: Additional fake data checks - but be much less punitive
         # Check if all major fields are just numbers
         numeric_fields = ['property_name', 'bedrooms', 'bathrooms', 'sq_ft', 'market_value']
         numeric_count = 0
@@ -153,14 +156,14 @@ def generate_trust_score(text, image_analysis, pdf_analysis):
             if field in text_lower and re.search(r'\b\d{1,2}\b', text_lower):
                 numeric_count += 1
 
-        if numeric_count >= 3:
+        if numeric_count >= 4:  # Increased threshold from 3 to 4
             fake_detected = True
-            trust_score -= 30
+            trust_score -= 25  # Reduced penalty from 30 to 25
             reasoning_parts.append("Multiple fields contain only numbers (highly suspicious)")
 
         # Ensure minimum score for any valid data
-        if trust_score < 10
-            trust_score = 10
+        if trust_score < 20 and (image_analysis or pdf_analysis):
+            trust_score = 20  # Increased minimum score from 10 to 20
 
         # Ensure score is within bounds
         trust_score = max(0, min(100, trust_score))
@@ -175,4 +178,4 @@ def generate_trust_score(text, image_analysis, pdf_analysis):
 
     except Exception as e:
         logger.error(f"Error in trust score generation: {str(e)}")
-        return 20.0, f"Trust analysis failed: {str(e)}"
+        return 35.0, f"Trust analysis failed: {str(e)}"  # Increased from 20.0 to 35.0
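One way to see how much the fake-data heuristics relaxed: the repeated-number check now requires at least five numbers, all identical, before it fires, where the old version could trigger with as few as three numbers. A small self-contained sketch of the new check (the helper name is ours, not the module's; the threshold values come from the diff):

```python
import re

# Sketch of the relaxed repeated-number heuristic: flag only when there are
# at least five numbers in the text and every one of them is the same.
def looks_like_repeated_numbers(text: str) -> bool:
    numbers = re.findall(r'\b\d+\b', str(text).lower())
    return len(numbers) >= 5 and len(set(numbers)) <= 1

print(looks_like_repeated_numbers("2 BHK, 2 bath, 2 balconies"))          # False: only 3 numbers
print(looks_like_repeated_numbers("2, 2, 2, 2, 2"))                       # True: 5 identical numbers
print(looks_like_repeated_numbers("3 BHK, 2 bath, 1450 sq ft, floor 7"))  # False: varied numbers
```

Combined with the base score moving from 30 to 50 and the 20-point floor whenever images or PDFs are attached, an ordinary listing like the third example now starts well clear of the fraud thresholds instead of hovering near them.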
|