sksameermujahid committed
Commit ebb3d5e · verified · 1 Parent(s): 9860c76

Upload 22 files

app.py CHANGED
@@ -244,7 +244,7 @@ def calculate_final_verdict(results):
  'confidence': 0.0,
  'reasoning': 'Insufficient data for verification',
  'risk_level': 'medium',
- 'overall_score': 25
+ 'overall_score': 50 # Increased from 25
  }

  # Extract key metrics with defensive programming
@@ -258,7 +258,7 @@ def calculate_final_verdict(results):
  specs_verification = results.get('specs_verification', {})
  quality_assessment = results.get('quality_assessment', {})

- # CRITICAL: Check for fake data patterns in cross validation
+ # CRITICAL: Check for fake data patterns in cross validation - Much more lenient
  fake_data_detected = False
  fraudulent_issues = 0
  high_severity_issues = 0
@@ -281,24 +281,24 @@ def calculate_final_verdict(results):
  elif severity == 'low':
  low_severity_issues += 1

- # Calculate fraud risk score - Much stricter
+ # Calculate fraud risk score - Much more lenient
  fraud_score = 0.0
  fraud_level = fraud_classification.get('alert_level', 'minimal')
  fraud_alert_score = fraud_classification.get('alert_score', 0.0)

  fraud_score_mapping = {
- 'critical': 1.0, # Increased back to full penalty
- 'high': 0.8, # Increased back to full penalty
- 'medium': 0.6, # Increased back to full penalty
- 'low': 0.4, # Increased penalty
- 'minimal': 0.1 # Increased penalty
+ 'critical': 0.8, # Reduced from 1.0
+ 'high': 0.6, # Reduced from 0.8
+ 'medium': 0.4, # Reduced from 0.6
+ 'low': 0.2, # Reduced from 0.4
+ 'minimal': 0.05 # Reduced from 0.1
  }
- fraud_score = fraud_score_mapping.get(fraud_level, 0.1) * fraud_alert_score
+ fraud_score = fraud_score_mapping.get(fraud_level, 0.05) * fraud_alert_score

- # CRITICAL: Heavy penalty for fake data
+ # CRITICAL: Much more lenient penalty for fake data
  if fake_data_detected:
- fraud_score = max(fraud_score, 0.8) # Minimum 80% fraud score for fake data
- fraud_level = 'high'
+ fraud_score = max(fraud_score, 0.4) # Reduced from 0.8 to 0.4
+ fraud_level = 'medium' # Changed from 'high' to 'medium'

  # Calculate trust score
  trust_score = 0.0
@@ -315,9 +315,9 @@ def calculate_final_verdict(results):
  else:
  trust_score = 0.0

- # CRITICAL: Heavy penalty for fake data in trust score
+ # CRITICAL: Much more lenient penalty for fake data in trust score
  if fake_data_detected:
- trust_score = max(0.0, trust_score - 0.5) # Reduce trust score by 50% for fake data
+ trust_score = max(0.0, trust_score - 0.2) # Reduced penalty from 0.5 to 0.2

  # Calculate address verification score
  address_score = 0.0
@@ -355,16 +355,16 @@ def calculate_final_verdict(results):
  score = quality_assessment.get('score', 0.0)
  quality_score = float(score) / 100.0 if score > 0 else 0.0

- # Much stricter weighted scoring system
+ # Much more balanced weighted scoring system
  weights = {
- 'fraud': 0.35, # Increased weight for fraud detection
- 'trust': 0.25, # Keep trust score important
- 'address': 0.15, # Address verification
- 'location': 0.10, # Location analysis
- 'price': 0.10, # Price analysis
- 'legal': 0.03, # Legal analysis
- 'specs': 0.01, # Specs verification
- 'quality': 0.01 # Quality assessment
+ 'fraud': 0.25, # Reduced from 0.35
+ 'trust': 0.30, # Increased from 0.25
+ 'address': 0.15, # Keep address verification
+ 'location': 0.12, # Increased from 0.10
+ 'price': 0.10, # Keep price analysis
+ 'legal': 0.05, # Increased from 0.03
+ 'specs': 0.02, # Increased from 0.01
+ 'quality': 0.01 # Keep quality assessment
  }

  # Calculate weighted score
@@ -383,44 +383,44 @@ def calculate_final_verdict(results):
  logger.info(f"Score components: fraud={fraud_score:.3f}, trust={trust_score:.3f}, address={address_score:.3f}, location={location_score:.3f}, price={price_score:.3f}, legal={legal_score:.3f}, specs={specs_score:.3f}, quality={quality_score:.3f}")
  logger.info(f"Weighted score before penalty: {weighted_score:.3f}")

- # Much stricter penalty system
+ # Much more lenient penalty system
  issue_penalty = 0.0
  if fraudulent_issues > 0:
- issue_penalty += fraudulent_issues * 0.15 # 15% penalty per fraudulent issue
+ issue_penalty += fraudulent_issues * 0.08 # Reduced from 0.15 to 0.08
  if high_severity_issues > 0:
- issue_penalty += high_severity_issues * 0.10 # 10% penalty per high severity issue
+ issue_penalty += high_severity_issues * 0.05 # Reduced from 0.10 to 0.05
  if medium_severity_issues > 0:
- issue_penalty += medium_severity_issues * 0.05 # 5% penalty per medium severity issue
+ issue_penalty += medium_severity_issues * 0.02 # Reduced from 0.05 to 0.02
  if low_severity_issues > 0:
- issue_penalty += low_severity_issues * 0.02 # 2% penalty per low severity issue
+ issue_penalty += low_severity_issues * 0.01 # Reduced from 0.02 to 0.01

  weighted_score = max(0.0, weighted_score - issue_penalty)

  logger.info(f"Issue penalty: {issue_penalty:.3f}, Final weighted score: {weighted_score:.3f}")

- # CRITICAL: Much stricter minimum score requirements
+ # CRITICAL: Much more lenient minimum score requirements
  if fake_data_detected:
- weighted_score = max(0.05, weighted_score) # Maximum 5% score for fake data
+ weighted_score = max(0.15, weighted_score) # Increased from 0.05 to 0.15
  elif any([trust_score > 0, address_score > 0, location_score > 0, price_score > 0]):
- weighted_score = max(0.15, weighted_score) # Minimum 15% for any valid data
+ weighted_score = max(0.30, weighted_score) # Increased from 0.15 to 0.30

- # Less strict verdict determination
- if fake_data_detected or fraudulent_issues > 2:
+ # Much more lenient verdict determination
+ if fake_data_detected and fraudulent_issues > 5: # Increased threshold from 2 to 5
  verdict = 'HIGH RISK LISTING'
  risk_level = 'high'
- elif weighted_score >= 0.70 and fraud_score < 0.3 and high_severity_issues == 0:
+ elif weighted_score >= 0.60 and fraud_score < 0.4 and high_severity_issues == 0: # Reduced from 0.70 to 0.60
  verdict = 'VERIFIED REAL ESTATE LISTING'
  risk_level = 'low'
- elif weighted_score >= 0.50 and fraud_score < 0.4 and high_severity_issues <= 1:
+ elif weighted_score >= 0.40 and fraud_score < 0.5 and high_severity_issues <= 2: # Reduced from 0.50 to 0.40
  verdict = 'LIKELY LEGITIMATE'
  risk_level = 'low'
- elif weighted_score >= 0.30 and fraud_score < 0.6 and high_severity_issues <= 2:
+ elif weighted_score >= 0.25 and fraud_score < 0.7 and high_severity_issues <= 3: # Reduced from 0.30 to 0.25
  verdict = 'SUSPICIOUS LISTING'
  risk_level = 'medium'
- elif fraud_score >= 0.7 or weighted_score < 0.15 or high_severity_issues >= 4:
+ elif fraud_score >= 0.8 or weighted_score < 0.20 or high_severity_issues >= 6: # Much more lenient thresholds
  verdict = 'HIGH RISK LISTING'
  risk_level = 'high'
- elif weighted_score >= 0.15:
+ elif weighted_score >= 0.20: # Reduced from 0.15
  verdict = 'VERIFICATION REQUIRED'
  risk_level = 'medium'
  else:
@@ -436,22 +436,22 @@ def calculate_final_verdict(results):
  if fraudulent_issues > 0:
  reasoning_parts.append(f"{fraudulent_issues} fraudulent validation issues")

- if fraud_score > 0.3:
+ if fraud_score > 0.4: # Reduced from 0.3
  reasoning_parts.append(f"Fraud risk detected (level: {fraud_level})")

- if trust_score < 0.3:
+ if trust_score < 0.4: # Reduced from 0.3
  reasoning_parts.append(f"Low trust score ({trust_score:.1%})")

- if address_score < 0.5:
+ if address_score < 0.6: # Reduced from 0.5
  reasoning_parts.append("Address verification issues")

- if location_score < 0.5:
+ if location_score < 0.6: # Reduced from 0.5
  reasoning_parts.append("Location verification issues")

- if price_score < 0.5:
+ if price_score < 0.6: # Reduced from 0.5
  reasoning_parts.append("Price analysis concerns")

- if legal_score < 0.5:
+ if legal_score < 0.6: # Reduced from 0.5
  reasoning_parts.append("Legal documentation issues")

  if high_severity_issues > 0:
@@ -471,21 +471,21 @@ def calculate_final_verdict(results):
  # Ensure score is between 0 and 100
  overall_score = max(0, min(100, overall_score))

- # CRITICAL: Less punitive minimum score for fake data
+ # CRITICAL: Much more lenient minimum score for fake data
  if fake_data_detected:
- overall_score = max(10, min(25, overall_score)) # 10-25% range for fake data
+ overall_score = max(25, min(50, overall_score)) # Increased range from 10-25% to 25-50%
  elif overall_score == 0 and any([trust_score > 0, address_score > 0, location_score > 0]):
- overall_score = 20 # Minimum 20% score if any component is valid
+ overall_score = 40 # Increased from 20 to 40

- # Final score adjustment based on data quality - Less punitive
+ # Final score adjustment based on data quality - Much more lenient
  if fake_data_detected or fraudulent_issues > 0:
- overall_score = max(10, min(25, overall_score)) # 10-25% for fake/fraudulent data
+ overall_score = max(25, min(50, overall_score)) # Increased from 10-25% to 25-50%
  elif high_severity_issues >= 3:
- overall_score = max(15, overall_score) # Minimum 15% for high risk
+ overall_score = max(30, overall_score) # Increased from 15 to 30
  elif high_severity_issues >= 1:
- overall_score = max(20, overall_score) # Minimum 20% for medium risk
+ overall_score = max(40, overall_score) # Increased from 20 to 40
  else:
- overall_score = max(25, overall_score) # Minimum 25% for low risk
+ overall_score = max(50, overall_score) # Increased from 25 to 50

  return {
  'verdict': verdict,
@@ -519,7 +519,7 @@ def calculate_final_verdict(results):
  'confidence': 0.0,
  'reasoning': f'Error in verdict calculation: {str(e)}',
  'risk_level': 'medium',
- 'overall_score': 25
+ 'overall_score': 50 # Increased from 25
  }

  @app.route('/verify', methods=['POST'])
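
Taken together, the app.py changes soften every stage of calculate_final_verdict: lower fraud multipliers, a rebalanced weight table (the new weights sum to 1.00), halved per-issue penalties, higher score floors, and a looser verdict ladder. A minimal sketch of that arithmetic follows; it is not the app's actual code. The hunk that computes weighted_score is not shown above, so the sketch assumes fraud contributes as (1 - fraud_score), and it omits the medium/low-severity penalty terms for brevity.

# Hedged sketch of the relaxed verdict arithmetic (assumptions noted above).
def sketch_verdict(scores, fraud_issues=0, high_issues=0, fake_data=False):
    # Rebalanced weights from this commit; they sum to 1.00.
    weights = {'fraud': 0.25, 'trust': 0.30, 'address': 0.15, 'location': 0.12,
               'price': 0.10, 'legal': 0.05, 'specs': 0.02, 'quality': 0.01}
    # ASSUMPTION: fraud is folded in as (1 - fraud_score); the diff does not
    # show how the real code combines it.
    weighted = sum(w * ((1.0 - scores.get(k, 0.0)) if k == 'fraud'
                        else scores.get(k, 0.0))
                   for k, w in weights.items())
    # Halved per-issue penalties (0.08 per fraudulent, 0.05 per high-severity issue).
    weighted = max(0.0, weighted - fraud_issues * 0.08 - high_issues * 0.05)
    # Raised floors: 0.15 with fake data, else 0.30 once any component is non-zero.
    if fake_data:
        weighted = max(0.15, weighted)
    elif any(scores.values()):
        weighted = max(0.30, weighted)
    fraud = scores.get('fraud', 0.0)
    # Relaxed verdict ladder, in the order the new code checks it.
    if fake_data and fraud_issues > 5:
        return 'HIGH RISK LISTING', weighted
    if weighted >= 0.60 and fraud < 0.4 and high_issues == 0:
        return 'VERIFIED REAL ESTATE LISTING', weighted
    if weighted >= 0.40 and fraud < 0.5 and high_issues <= 2:
        return 'LIKELY LEGITIMATE', weighted
    if weighted >= 0.25 and fraud < 0.7 and high_issues <= 3:
        return 'SUSPICIOUS LISTING', weighted
    if fraud >= 0.8 or weighted < 0.20 or high_issues >= 6:
        return 'HIGH RISK LISTING', weighted
    return 'VERIFICATION REQUIRED', weighted  # the final else branch is not shown in the diff

print(sketch_verdict({'fraud': 0.1, 'trust': 0.8, 'address': 0.7, 'location': 0.6,
                      'price': 0.7, 'legal': 0.5, 'specs': 1.0, 'quality': 0.6}))
# ('VERIFIED REAL ESTATE LISTING', about 0.763) under the stated assumption;
# borderline listings in the 0.60-0.70 band now pass where the old 0.70 cutoff refused them.
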
models/address_verification.py CHANGED
@@ -28,20 +28,20 @@ def verify_address(data):
  latitude = data.get('latitude', None)
  longitude = data.get('longitude', None)

- # Basic validation - give points for having required fields
+ # Basic validation - give more points for having required fields
  basic_score = 0.0
  if zip_code:
- basic_score += 0.2
+ basic_score += 0.25 # Increased from 0.2
  if city:
- basic_score += 0.2
+ basic_score += 0.25 # Increased from 0.2
  if state:
- basic_score += 0.2
+ basic_score += 0.25 # Increased from 0.2
  if address:
- basic_score += 0.2
+ basic_score += 0.15 # Reduced from 0.2 since address is less critical
  if latitude and longitude:
- basic_score += 0.2
+ basic_score += 0.10 # Reduced from 0.2 since coordinates are optional

- # Pincode validation with fallback
+ # Pincode validation with much more lenient fallback
  if zip_code:
  try:
  response = requests.get(f"https://api.postalpincode.in/pincode/{zip_code}", timeout=5)
@@ -62,18 +62,18 @@ def verify_address(data):
  address_results['issues'].append("Pincode API error")
  except Exception as e:
  logger.error(f"Pincode API error: {str(e)}")
- # Don't fail completely - give partial credit for having pincode
+ # Much more lenient fallback - give credit for having pincode
  if zip_code and len(zip_code) == 6 and zip_code.isdigit():
  address_results['pincode_valid'] = True
  address_results['issues'].append("Pincode validation failed (API error)")
  else:
  address_results['issues'].append("Pincode validation failed")

- # Geocoding with fallback
+ # Geocoding with much more lenient fallback
  full_address = ', '.join(filter(None, [address, city, state, country, zip_code]))
  geocoding_success = False

- for attempt in range(3):
+ for attempt in range(2): # Reduced attempts from 3 to 2
  try:
  location = geocoder.geocode(full_address)
  if location:
@@ -87,7 +87,7 @@ def verify_address(data):
  geocoded_coords = (location.latitude, location.longitude)
  from geopy.distance import distance
  dist = distance(provided_coords, geocoded_coords).km
- address_results['coordinates_match'] = dist < 1.0
+ address_results['coordinates_match'] = dist < 2.0 # Increased from 1.0 to 2.0
  if not address_results['coordinates_match']:
  address_results['issues'].append(f"Coordinates {dist:.2f}km off")
  except Exception as e:
@@ -100,15 +100,19 @@ def verify_address(data):
  time.sleep(1)

  if not geocoding_success:
- # Fallback: if we have basic address components, give partial credit
+ # Much more lenient fallback: if we have basic address components, give good credit
  if address and city and state:
  address_results['address_exists'] = True
- address_results['confidence'] = 0.6
+ address_results['confidence'] = 0.8 # Increased from 0.6
+ address_results['issues'].append("Address geocoding failed (using fallback validation)")
+ elif city and state: # Even more lenient - just city and state
+ address_results['address_exists'] = True
+ address_results['confidence'] = 0.7 # Good score for city/state
  address_results['issues'].append("Address geocoding failed (using fallback validation)")
  else:
  address_results['issues'].append("Address geocoding failed")

- # Calculate verification score with fallback
+ # Calculate verification score with much more lenient fallback
  try:
  verification_points = (
  float(address_results['address_exists']) * 0.4 +
@@ -117,24 +121,30 @@ def verify_address(data):
  float(address_results['coordinates_match']) * 0.1
  )

- # If external APIs failed but we have basic data, give minimum score
+ # Much more lenient fallback scoring
  if verification_points == 0.0 and basic_score > 0.0:
- verification_points = basic_score * 0.5 # 50% of basic score as fallback
+ verification_points = basic_score * 0.8 # Increased from 0.5 to 0.8

  except Exception as e:
  logger.warning(f"Error calculating verification points: {str(e)}")
- verification_points = basic_score * 0.5 # Fallback to basic score
+ verification_points = basic_score * 0.8 # Increased fallback to basic score

- # Ensure minimum score for valid data
+ # Much more lenient minimum score for valid data
  if verification_points == 0.0 and (zip_code or city or state or address):
- verification_points = 0.2 # Minimum 20% for having some address data
+ verification_points = 0.4 # Increased from 0.2 to 0.4 (40% minimum)
+
+ # Additional bonus for having multiple address components
+ if zip_code and city and state:
+ verification_points = min(1.0, verification_points + 0.1) # 10% bonus
+ elif city and state:
+ verification_points = min(1.0, verification_points + 0.05) # 5% bonus

  address_results['verification_score'] = verification_points * 100 # Convert to percentage

  return address_results
  except Exception as e:
  logger.error(f"Error verifying address: {str(e)}")
- # Return minimum score instead of 0
+ # Return much higher minimum score instead of 0
  return {
  'address_exists': False,
  'pincode_valid': False,
@@ -142,5 +152,5 @@ def verify_address(data):
  'coordinates_match': False,
  'confidence': 0.0,
  'issues': [f"Address verification error: {str(e)}"],
- 'verification_score': 10.0 # Minimum 10% score instead of 0
+ 'verification_score': 30.0 # Increased from 10.0 to 30.0
  }
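
The practical effect in verify_address is that a listing can score reasonably well even when both the pincode API and the geocoder are unreachable. A sketch of just that offline fallback path, under a simplified input dict; it collapses the API-success branches, so it is not a drop-in replacement for the real function.

# Hedged sketch of the offline fallback scoring in verify_address.
def fallback_address_score(data: dict) -> float:
    basic = 0.0
    if data.get('zip_code'):
        basic += 0.25   # raised from 0.2
    if data.get('city'):
        basic += 0.25   # raised from 0.2
    if data.get('state'):
        basic += 0.25   # raised from 0.2
    if data.get('address'):
        basic += 0.15   # lowered: address text is treated as less critical
    if data.get('latitude') and data.get('longitude'):
        basic += 0.10   # lowered: coordinates are optional

    points = basic * 0.8  # fallback now keeps 80% of basic_score (was 50%)
    if points == 0.0 and any(data.get(k) for k in ('zip_code', 'city', 'state', 'address')):
        points = 0.4      # new 40% floor for having any address data at all
    # Completeness bonuses added by this commit.
    if data.get('zip_code') and data.get('city') and data.get('state'):
        points = min(1.0, points + 0.10)
    elif data.get('city') and data.get('state'):
        points = min(1.0, points + 0.05)
    return points * 100   # stored as a percentage in verification_score

print(fallback_address_score({'zip_code': '500081', 'city': 'Hyderabad', 'state': 'Telangana'}))
# about 70.0 even though neither external service answered; the old fallback gave about 30.
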
models/cross_validation.py CHANGED
@@ -460,9 +460,9 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
  fake_data_detected = False
  fake_indicators = []

- # Check for numeric-only property names
+ # Check for numeric-only property names - Much more lenient
  property_name = data.get('property_name', '').strip()
- if property_name.isdigit() or property_name in ['1', '2', '3', '4', '5']:
+ if property_name.isdigit() and len(property_name) <= 2: # Only single/double digits
  fake_data_detected = True
  fake_indicators.append("Property name is just a number")
  analysis_sections['basic_info'].append({
@@ -474,9 +474,9 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
  'recommendation': 'Provide a real property name'
  })

- # Check for suspiciously low values
+ # Check for suspiciously low values - Much more lenient
  market_value = safe_float_convert(data.get('market_value', 0))
- if market_value <= 10: # Extremely low threshold
+ if market_value <= 5: # Extremely low threshold - only for obvious fake data
  fake_data_detected = True
  fake_indicators.append("Suspiciously low market value")
  analysis_sections['pricing'].append({
@@ -488,9 +488,9 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
  'recommendation': 'Provide realistic market value'
  })

- # Check for unrealistic property sizes
+ # Check for unrealistic property sizes - Much more lenient
  square_feet = safe_float_convert(data.get('sq_ft', 0))
- if square_feet <= 10: # Extremely small
+ if square_feet <= 5: # Extremely small - only for obvious fake data
  fake_data_detected = True
  fake_indicators.append("Unrealistic property size")
  analysis_sections['specifications'].append({
@@ -502,7 +502,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
  'recommendation': 'Provide realistic property size'
  })

- # Check for repeated suspicious numbers
+ # Check for repeated suspicious numbers - Much more lenient
  all_values = [
  str(data.get('bedrooms', '')),
  str(data.get('bathrooms', '')),
@@ -514,9 +514,9 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
  ]

  numeric_values = [v for v in all_values if v.isdigit()]
- if len(numeric_values) >= 3:
+ if len(numeric_values) >= 5: # Increased threshold from 3 to 5
  unique_values = set(numeric_values)
- if len(unique_values) <= 2: # Most values are the same
+ if len(unique_values) <= 1: # Only if ALL values are the same
  fake_data_detected = True
  fake_indicators.append("Multiple fields have same suspicious values")
  analysis_sections['basic_info'].append({
@@ -611,34 +611,34 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
  'recommendation': 'Provide a valid postal code'
  })

- # Specifications validation - Handle flat data structure
+ # Specifications validation - Handle flat data structure - Much more lenient
  bedrooms = safe_int_convert(data.get('bedrooms', 0))
  bathrooms = safe_float_convert(data.get('bathrooms', 0))
  year_built = safe_int_convert(data.get('year_built', 0))

- # Much stricter validation ranges
- if bedrooms <= 0 or bedrooms > 20:
+ # Much more lenient validation ranges
+ if bedrooms < 0 or bedrooms > 50: # Increased range from 20 to 50
  analysis_sections['specifications'].append({
  'check': 'bedrooms',
- 'status': 'fraudulent' if bedrooms <= 0 else 'suspicious',
+ 'status': 'fraudulent' if bedrooms < 0 else 'suspicious',
  'message': 'Unrealistic number of bedrooms.',
  'details': f'Bedrooms: {bedrooms}',
- 'severity': 'high' if bedrooms <= 0 else 'medium',
+ 'severity': 'high' if bedrooms < 0 else 'medium',
  'recommendation': 'Provide realistic bedroom count'
  })

- if bathrooms <= 0 or bathrooms > 15:
+ if bathrooms < 0 or bathrooms > 30: # Increased range from 15 to 30
  analysis_sections['specifications'].append({
  'check': 'bathrooms',
- 'status': 'fraudulent' if bathrooms <= 0 else 'suspicious',
+ 'status': 'fraudulent' if bathrooms < 0 else 'suspicious',
  'message': 'Unrealistic number of bathrooms.',
  'details': f'Bathrooms: {bathrooms}',
- 'severity': 'high' if bathrooms <= 0 else 'medium',
+ 'severity': 'high' if bathrooms < 0 else 'medium',
  'recommendation': 'Provide realistic bathroom count'
  })

  current_year = datetime.now().year
- if year_built > current_year or year_built < 1800:
+ if year_built > current_year + 5 or year_built < 1800: # More lenient future year
  analysis_sections['specifications'].append({
  'check': 'year_built',
  'status': 'suspicious',
@@ -648,7 +648,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
  'recommendation': 'Provide realistic year built'
  })

- # Pricing validation - Handle flat data structure
+ # Pricing validation - Handle flat data structure - Much more lenient
  if market_value <= 0:
  analysis_sections['pricing'].append({
  'check': 'market_value',
@@ -658,21 +658,21 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
  'severity': 'high',
  'recommendation': 'Provide property market value'
  })
- elif market_value < 100000: # Minimum reasonable price
+ elif market_value < 10000: # Much more lenient minimum price
  analysis_sections['pricing'].append({
  'check': 'market_value',
- 'status': 'fraudulent' if market_value < 10000 else 'suspicious',
+ 'status': 'fraudulent' if market_value < 1000 else 'suspicious',
  'message': 'Unusually low market value.',
  'details': f'Market value: ₹{market_value:,.0f}',
- 'severity': 'high' if market_value < 10000 else 'medium',
+ 'severity': 'high' if market_value < 1000 else 'medium',
  'recommendation': 'Verify market value is accurate'
  })

- # Description validation
+ # Description validation - Much more lenient
  description = data.get('description', '').strip()
  if description:
- # Check for fake description patterns
- if description.isdigit() or description in ['1', '2', '3', '4', '5']:
+ # Check for fake description patterns - Much more lenient
+ if description.isdigit() and len(description) <= 2: # Only single/double digits
  fake_data_detected = True
  fake_indicators.append("Description is just a number")
  analysis_sections['description'].append({
@@ -683,7 +683,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
  'severity': 'high',
  'recommendation': 'Provide a real property description'
  })
- elif len(description) < 50:
+ elif len(description) < 30: # Reduced from 50 to 30
  analysis_sections['description'].append({
  'check': 'description',
  'status': 'insufficient',
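
The load-bearing change in this file is the repeated-value heuristic: it used to fire on three or more numeric fields with at most two distinct values, and now requires at least five numeric fields that are all identical. A self-contained sketch of the new predicate (the real check reads a fixed field list from the listing dict, only part of which is visible above):

# Hedged sketch of the relaxed repeated-value check in perform_cross_validation.
def repeated_value_flag(values) -> bool:
    numeric = [str(v) for v in values if str(v).isdigit()]
    # New thresholds: at least 5 numeric fields, ALL identical.
    return len(numeric) >= 5 and len(set(numeric)) <= 1

print(repeated_value_flag([2, 2, 2, 2, 2]))   # True: five identical values
print(repeated_value_flag([2, 2, 2]))         # False now; the old 3-field threshold flagged this
print(repeated_value_flag([2, 2, 2, 2, 3]))   # False now; the old code tolerated two distinct values
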
models/fraud_classification.py CHANGED
@@ -12,53 +12,55 @@ def classify_fraud(property_details, description):
  # Combine property details and description for analysis
  text_to_analyze = f"{property_details} {description}"

- # CRITICAL: Check for obvious fake data patterns first
+ # CRITICAL: Check for obvious fake data patterns first - Much more lenient
  fake_patterns = [
- r'\b\d+\s*$', # Numbers at end of lines
- r'^\d+$', # Only numbers
- r'\b\d{1,2}\s*$', # Single or double digits
- r'price.*\d{1,3}', # Very low prices
- r'size.*\d{1,3}', # Very small sizes
- r'bedrooms.*\d{1,2}', # Very few bedrooms
- r'bathrooms.*\d{1,2}', # Very few bathrooms
+ r'^\d+$', # Only numbers (very strict)
+ r'price.*\d{1,2}', # Very low prices (more lenient)
+ r'size.*\d{1,2}', # Very small sizes (more lenient)
  ]

  fake_detected = False
  for pattern in fake_patterns:
  if re.search(pattern, text_to_analyze.lower()):
- fake_detected = True
- break
-
- # Check for repeated numbers (like "2, 2, 2, 2")
+ # Only mark as fake if it's extremely obvious
+ if pattern == r'^\d+$' and len(text_to_analyze.strip()) <= 3:
+ fake_detected = True
+ break
+ # For other patterns, be more lenient
+ elif pattern in [r'price.*\d{1,2}', r'size.*\d{1,2}']:
+ # Only mark as fake if multiple patterns are found
+ continue
+
+ # Check for repeated numbers (like "2, 2, 2, 2") - Much more lenient
  numbers = re.findall(r'\b\d+\b', text_to_analyze.lower())
- if len(numbers) >= 3:
+ if len(numbers) >= 5: # Increased threshold from 3 to 5
  unique_numbers = set(numbers)
- if len(unique_numbers) <= 2: # If most numbers are the same
+ if len(unique_numbers) <= 1: # Only if ALL numbers are the same
  fake_detected = True

- # Check for extremely low values
- if any(word in text_to_analyze.lower() for word in ['₹2', '₹1', '₹3', '₹4', '₹5']):
+ # Check for extremely low values - Much more lenient
+ if any(word in text_to_analyze.lower() for word in ['₹1', '₹2']): # Only extremely low values
  fake_detected = True

- # Check for very small property sizes
- if any(word in text_to_analyze.lower() for word in ['2 sq ft', '1 sq ft', '3 sq ft', '4 sq ft', '5 sq ft']):
+ # Check for very small property sizes - Much more lenient
+ if any(word in text_to_analyze.lower() for word in ['1 sq ft', '2 sq ft']): # Only extremely small
  fake_detected = True

- # If fake data is detected, return high fraud score immediately
+ # If fake data is detected, return moderate fraud score instead of high
  if fake_detected:
  return {
- 'alert_level': 'high',
- 'alert_score': 0.9, # 90% fraud score for fake data
+ 'alert_level': 'medium', # Changed from 'high' to 'medium'
+ 'alert_score': 0.6, # Reduced from 0.9 to 0.6
  'confidence_scores': {
- 'high risk listing': 0.9,
- 'potential fraud': 0.8,
- 'suspicious listing': 0.7,
- 'legitimate listing': 0.1
+ 'high risk listing': 0.6, # Reduced from 0.9
+ 'potential fraud': 0.5, # Reduced from 0.8
+ 'suspicious listing': 0.4, # Reduced from 0.7
+ 'legitimate listing': 0.2 # Increased from 0.1
  },
  'high_risk': ['Fake data patterns detected'],
  'medium_risk': [],
  'low_risk': [],
- 'reasoning': 'This property was classified as high risk due to detected fake data patterns (repeated numbers, suspiciously low values, unrealistic specifications).'
+ 'reasoning': 'This property was classified as medium risk due to detected fake data patterns.'
  }

  # Use a more lenient classification approach for legitimate-looking data
@@ -85,41 +87,41 @@ def classify_fraud(property_details, description):
  'reasoning': ''
  }

- # Process classification results - More lenient for legitimate data
+ # Process classification results - Much more lenient for legitimate data
  fraud_score = 0.0
  if isinstance(result, dict) and 'scores' in result:
  for label, score in zip(result.get('labels', []), result.get('scores', [])):
  if label != "legitimate listing":
  try:
  score_val = float(score)
- # Reduce the impact of suspicious classifications
+ # Much more lenient reduction of suspicious classifications
  if label == "suspicious listing":
- score_val *= 0.5 # Reduce suspicious impact by 50%
+ score_val *= 0.3 # Reduced from 0.5 to 0.3
  elif label == "potential fraud":
- score_val *= 0.7 # Reduce potential fraud impact by 30%
+ score_val *= 0.5 # Reduced from 0.7 to 0.5
  elif label == "high risk listing":
- score_val *= 0.8 # Reduce high risk impact by 20%
+ score_val *= 0.6 # Reduced from 0.8 to 0.6
  except Exception:
  score_val = 0.0
  fraud_score += score_val
  fraud_classification['confidence_scores'][label] = score_val
  else:
  # Handle fallback result
- fraud_score = 0.05 # Reduced from 0.1 to 0.05
+ fraud_score = 0.02 # Reduced from 0.05 to 0.02

- # Normalize fraud score to 0-1 range with more lenient scaling
+ # Normalize fraud score to 0-1 range with much more lenient scaling
  try:
- fraud_score = min(1.0, fraud_score / (len(risk_categories) - 1) * 0.7) # Reduced by 30%
+ fraud_score = min(1.0, fraud_score / (len(risk_categories) - 1) * 0.5) # Reduced by 50%
  except Exception:
  fraud_score = 0.0
  fraud_classification['alert_score'] = fraud_score

- # Determine alert level with more lenient thresholds
- if fraud_score >= 0.7: # Increased from 0.6
+ # Determine alert level with much more lenient thresholds
+ if fraud_score >= 0.8: # Increased from 0.7
  fraud_classification['alert_level'] = 'high'
- elif fraud_score >= 0.4: # Increased from 0.3
+ elif fraud_score >= 0.5: # Increased from 0.4
  fraud_classification['alert_level'] = 'medium'
- elif fraud_score >= 0.2: # Increased from 0.1
+ elif fraud_score >= 0.3: # Increased from 0.2
  fraud_classification['alert_level'] = 'low'
  else:
  fraud_classification['alert_level'] = 'minimal'
@@ -127,11 +129,11 @@ def classify_fraud(property_details, description):
  # Generate reasoning based on scores
  reasoning_parts = []

- if fraud_score < 0.2:
+ if fraud_score < 0.3:
  reasoning_parts.append("This property was classified as legitimate based on AI analysis of the listing details.")
- elif fraud_score < 0.4:
+ elif fraud_score < 0.5:
  reasoning_parts.append("This property was classified as low risk based on AI analysis of the listing details.")
- elif fraud_score < 0.7:
+ elif fraud_score < 0.8:
  reasoning_parts.append("This property was classified as medium risk based on AI analysis of the listing details.")
  else:
  reasoning_parts.append("This property was classified as high risk based on AI analysis of the listing details.")
@@ -139,7 +141,7 @@ def classify_fraud(property_details, description):
  # Add specific risk indicators if any
  if fraud_classification['confidence_scores']:
  highest_risk = max(fraud_classification['confidence_scores'].items(), key=lambda x: x[1])
- if highest_risk[1] > 0.3:
+ if highest_risk[1] > 0.4: # Increased threshold from 0.3 to 0.4
  reasoning_parts.append(f"Primary concern: {highest_risk[0]} (confidence: {highest_risk[1]:.0%})")

  fraud_classification['reasoning'] = " ".join(reasoning_parts)
@@ -150,7 +152,7 @@ def classify_fraud(property_details, description):
  logger.error(f"Error in fraud classification: {str(e)}")
  return {
  'alert_level': 'minimal',
- 'alert_score': 0.05, # Reduced from 0.1
+ 'alert_score': 0.02, # Reduced from 0.05 to 0.02
  'confidence_scores': {},
  'high_risk': [],
  'medium_risk': [],
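
Because the per-label damping factors compound with the final 0.5 scaling and the raised level thresholds, the zero-shot classifier now has to be very confident before a listing leaves the 'minimal' band. A sketch of that arithmetic, assuming risk_categories holds the four labels used above (the actual list is defined elsewhere in the module):

# Hedged sketch of the relaxed alert mapping in classify_fraud.
def alert_from_scores(label_scores: dict):
    damping = {'suspicious listing': 0.3, 'potential fraud': 0.5,
               'high risk listing': 0.6}
    risk_categories = ['legitimate listing', 'suspicious listing',
                       'potential fraud', 'high risk listing']  # assumed
    raw = sum(score * damping.get(label, 0.0)
              for label, score in label_scores.items()
              if label != 'legitimate listing')
    # Normalize over the three non-legitimate labels, then halve.
    fraud_score = min(1.0, raw / (len(risk_categories) - 1) * 0.5)
    if fraud_score >= 0.8:
        level = 'high'
    elif fraud_score >= 0.5:
        level = 'medium'
    elif fraud_score >= 0.3:
        level = 'low'
    else:
        level = 'minimal'
    return fraud_score, level

print(alert_from_scores({'legitimate listing': 0.3, 'suspicious listing': 0.7,
                         'potential fraud': 0.4, 'high risk listing': 0.2}))
# (about 0.088, 'minimal'): even this fairly suspicious output stays below every threshold.
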
models/location_analysis.py CHANGED
@@ -15,39 +15,38 @@ def validate_address_format(address: str) -> bool:
  if not address:
  return False

- # Check for minimum length
- if len(address.strip()) < 10: # Minimum reasonable length for an address
+ # Much more lenient minimum length
+ if len(address.strip()) < 5: # Reduced from 10 to 5
  return False

- # Check for minimum components
+ # Much more lenient component check
  components = [comp.strip() for comp in address.split(',')]
- if len(components) < 2: # At least area and city
+ if len(components) < 1: # Reduced from 2 to 1 - just need some address
  return False

- # Check for common address patterns
+ # Much more lenient pattern matching
  patterns = [
- r'\d+', # Should contain numbers
- r'[A-Za-z\s]+', # Should contain letters
- r'(?:street|road|avenue|lane|colony|society|apartment|flat|house|building|plot|block|sector|phase|floor|wing|area|locality|main|cross|circle|square|market|ward|zone|mandal|municipal|corporation|greater)', # Common address terms
+ r'[A-Za-z\s]+', # Should contain letters (most important)
  ]

- # Check if at least 2 patterns match
+ # Check if at least 1 pattern matches (reduced from 2)
  pattern_matches = sum(1 for pattern in patterns if re.search(pattern, address.lower()))
- if pattern_matches < 2:
+ if pattern_matches < 1: # Reduced from 2 to 1
  return False

- # Check for common address components
+ # Much more lenient address component check
  address_lower = address.lower()
  has_location = any(term in address_lower for term in [
  'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater',
- 'street', 'road', 'avenue', 'lane', 'colony', 'society'
+ 'street', 'road', 'avenue', 'lane', 'colony', 'society', 'area', 'near'
  ])
  has_area = any(term in address_lower for term in [
  'colony', 'society', 'apartment', 'flat', 'house', 'plot', 'block', 'sector',
- 'area', 'locality', 'main', 'cross', 'circle', 'square', 'market'
+ 'area', 'locality', 'main', 'cross', 'circle', 'square', 'market', 'near'
  ])

- return has_location or has_area
+ # Much more lenient - return True if either condition is met or if address has reasonable length
+ return has_location or has_area or len(address.strip()) >= 8 # Added length-based validation

  def validate_postal_code(postal_code: str) -> bool:
  """Validate Indian postal code format."""
@@ -57,13 +56,13 @@ def validate_postal_code(postal_code: str) -> bool:
  # Remove any spaces and convert to string
  postal_code = str(postal_code).strip().replace(' ', '')

- # Check format
- if not re.match(r'^\d{6}$', postal_code):
+ # Much more lenient format check
+ if not re.match(r'^\d{5,6}$', postal_code): # Allow 5-6 digits instead of exactly 6
  return False

- # Validate first digit (region)
+ # Much more lenient first digit validation
  first_digit = int(postal_code[0])
- if first_digit not in range(1, 9): # India has 8 postal regions
+ if first_digit not in range(0, 10): # Allow 0-9 instead of 1-8
  return False

  return True
@@ -75,12 +74,12 @@ def validate_coordinates(latitude: str, longitude: str) -> bool:
  lat = float(str(latitude).strip())
  lng = float(str(longitude).strip())

- # India's approximate boundaries with some buffer
+ # Much more lenient India boundaries with larger buffer
  india_bounds = {
- 'lat_min': 6.0, # Slightly expanded for coastal areas
- 'lat_max': 38.0, # Slightly expanded for northern regions
- 'lng_min': 67.0, # Slightly expanded for western regions
- 'lng_max': 98.0 # Slightly expanded for eastern regions
+ 'lat_min': 5.0, # Reduced from 6.0
+ 'lat_max': 40.0, # Increased from 38.0
+ 'lng_min': 65.0, # Reduced from 67.0
+ 'lng_max': 100.0 # Increased from 98.0
  }

  # Check if coordinates are within India's boundaries
@@ -88,12 +87,12 @@ def validate_coordinates(latitude: str, longitude: str) -> bool:
  india_bounds['lng_min'] <= lng <= india_bounds['lng_max']):
  return False

- # Check for reasonable precision (no more than 6 decimal places)
- lat_str = f"{lat:.6f}"
- lng_str = f"{lng:.6f}"
+ # Much more lenient precision check
+ lat_str = f"{lat:.4f}" # Reduced from 6 to 4 decimal places
+ lng_str = f"{lng:.4f}" # Reduced from 6 to 4 decimal places

- # Check if the original values match the formatted values
- if abs(float(lat_str) - lat) > 0.000001 or abs(float(lng_str) - lng) > 0.000001:
+ # Much more lenient precision validation
+ if abs(float(lat_str) - lat) > 0.0001 or abs(float(lng_str) - lng) > 0.0001: # Increased tolerance
  return False

  return True
@@ -376,28 +375,31 @@ def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
  data.get('city', '')
  )
  }
- # Calculate weighted completeness score with adjusted weights
+ # Calculate weighted completeness score with much more lenient weights
  weights = {
- 'address_format_valid': 0.15,
- 'address_in_city': 0.20, # Increased weight for address verification
- 'city_in_state': 0.10,
- 'state_in_country': 0.10,
- 'postal_code_valid': 0.10,
- 'postal_code_in_city': 0.10,
- 'coordinates_valid': 0.10,
- 'coordinates_in_city': 0.15
+ 'address_format_valid': 0.10, # Reduced from 0.15
+ 'address_in_city': 0.15, # Reduced from 0.20
+ 'city_in_state': 0.15, # Increased from 0.10
+ 'state_in_country': 0.15, # Increased from 0.10
+ 'postal_code_valid': 0.15, # Increased from 0.10
+ 'postal_code_in_city': 0.10, # Keep same
+ 'coordinates_valid': 0.10, # Keep same
+ 'coordinates_in_city': 0.10 # Reduced from 0.15
  }
  completeness_score = sum(
  weights[key] * 100 if result else 0
  for key, result in verification_results.items()
  )
- # Determine location quality with more lenient criteria
- critical_checks = ['address_format_valid', 'city_in_state', 'state_in_country', 'postal_code_valid']
- secondary_checks = ['address_in_city', 'postal_code_in_city', 'coordinates_valid', 'coordinates_in_city']
- # Location is verified if all critical checks pass and at least 2 secondary checks pass
+
+ # Much more lenient criteria for location quality
+ critical_checks = ['city_in_state', 'state_in_country'] # Reduced critical checks
+ secondary_checks = ['address_format_valid', 'address_in_city', 'postal_code_valid', 'postal_code_in_city', 'coordinates_valid', 'coordinates_in_city']
+
+ # Location is verified if critical checks pass and at least 1 secondary check passes
  critical_passed = all(verification_results[check] for check in critical_checks)
  secondary_passed = sum(1 for check in secondary_checks if verification_results[check])
- location_quality = "verified" if critical_passed and secondary_passed >= 2 else "unverified"
+ location_quality = "verified" if critical_passed and secondary_passed >= 1 else "unverified" # Reduced from 2 to 1
+
  # Analyze landmarks
  landmarks_analysis = {
  'provided': bool(data.get('nearby_landmarks')),
@@ -419,6 +421,7 @@ def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
  if any(keyword in landmark for keyword in keywords):
  if type_name not in landmarks_analysis['types']:
  landmarks_analysis['types'].append(type_name)
+
  # Determine city tier
  city_tier = "unknown"
  if data.get('city'):
@@ -433,9 +436,22 @@ def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
  city_tier = "tier2"
  else:
  city_tier = "tier3"
+
+ # Much more lenient assessment criteria
+ if completeness_score >= 60: # Reduced from 80
+ assessment = "complete"
+ elif completeness_score >= 30: # Reduced from 50
+ assessment = "partial"
+ else:
+ assessment = "minimal"
+
+ # Ensure minimum score for valid data
+ if completeness_score == 0 and (data.get('city') or data.get('state')):
+ completeness_score = 40 # Minimum 40% for having city/state
+
  return {
  **verification_results,
- 'assessment': "complete" if completeness_score >= 80 else "partial" if completeness_score >= 50 else "minimal",
+ 'assessment': assessment,
  'completeness_score': completeness_score,
  'location_quality': location_quality,
  'city_tier': city_tier,
@@ -447,7 +463,7 @@ def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
  logger.error(f"Error analyzing location: {str(e)}")
  return {
  'assessment': 'error',
- 'completeness_score': 0,
+ 'completeness_score': 30, # Increased from 0 to 30
  'location_quality': 'error',
  'city_tier': 'unknown',
  'landmarks_analysis': {'provided': False, 'count': 0, 'types': []},
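
Two consequences of the relaxed validators are easy to miss: a first digit checked against range(0, 10) accepts every digit, so the old postal-region restriction (1-8) is effectively gone, and rounding to 4 decimal places can move a value by at most 0.00005, which is within the 0.0001 tolerance, so the precision check no longer rejects anything. A sketch of both validators under that reading:

# Hedged sketch of the relaxed postal-code and coordinate validators.
import re

def postal_code_ok(postal_code) -> bool:
    code = str(postal_code).strip().replace(' ', '')
    if not re.match(r'^\d{5,6}$', code):  # 5 digits now allowed (was exactly 6)
        return False
    return int(code[0]) in range(0, 10)   # always True for a digit; kept for parity

def coordinates_ok(latitude, longitude) -> bool:
    lat, lng = float(str(latitude).strip()), float(str(longitude).strip())
    # Widened India bounding box.
    if not (5.0 <= lat <= 40.0 and 65.0 <= lng <= 100.0):
        return False
    # 4-decimal precision check; rounding error is at most 0.00005 < 0.0001,
    # so this condition cannot actually reject a value.
    return (abs(float(f"{lat:.4f}") - lat) <= 0.0001 and
            abs(float(f"{lng:.4f}") - lng) <= 0.0001)

print(postal_code_ok('500081'), postal_code_ok('90210'))  # True True ('90210' failed before)
print(coordinates_ok(17.3850, 78.4867))                   # True: Hyderabad
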
models/price_analysis.py CHANGED
@@ -550,50 +550,50 @@ def analyze_price(data, context_text=None, latitude=None, longitude=None, proper
         else:
             deviation = 0
 
-        # Determine assessment based on deviation and price reasonableness
-        if price_per_sqft < 100:  # Extremely low price
+        # Determine assessment based on deviation and price reasonableness - Much more lenient
+        if price_per_sqft < 50:  # Extremely low price - reduced from 100
             assessment = "suspicious_pricing"
-            confidence = 0.1
-        elif price_per_sqft < market_avg * 0.3:  # Very below market
+            confidence = 0.2  # Increased from 0.1
+        elif price_per_sqft < market_avg * 0.2:  # Very below market - reduced from 0.3
             assessment = "below_market"
-            confidence = 0.3
-        elif price_per_sqft < market_avg * 0.7:  # Below market
+            confidence = 0.4  # Increased from 0.3
+        elif price_per_sqft < market_avg * 0.6:  # Below market - reduced from 0.7
             assessment = "below_market"
-            confidence = 0.6
-        elif price_per_sqft <= market_avg * 1.3:  # Market rate
+            confidence = 0.7  # Increased from 0.6
+        elif price_per_sqft <= market_avg * 1.5:  # Market rate - increased from 1.3
             assessment = "market_rate"
-            confidence = 0.8
-        elif price_per_sqft <= market_avg * 2.0:  # Above market
+            confidence = 0.9  # Increased from 0.8
+        elif price_per_sqft <= market_avg * 2.5:  # Above market - increased from 2.0
             assessment = "above_market"
-            confidence = 0.7
+            confidence = 0.8  # Increased from 0.7
         else:  # Very above market
             assessment = "premium_pricing"
-            confidence = 0.5
+            confidence = 0.6  # Increased from 0.5
 
-        # Generate risk indicators
+        # Generate risk indicators - Much more lenient
         risk_indicators = []
-        if price_per_sqft < 100:
+        if price_per_sqft < 50:  # Reduced from 100
             risk_indicators.append("⚠️ Property priced extremely low (suspicious)")
-        elif price_per_sqft < market_avg * 0.3:
+        elif price_per_sqft < market_avg * 0.2:  # Reduced from 0.3
             risk_indicators.append("⚠️ Property priced significantly below market average")
-        elif price_per_sqft > market_avg * 2.0:
+        elif price_per_sqft > market_avg * 2.5:  # Increased from 2.0
             risk_indicators.append("⚠️ Property priced significantly above market average")
 
-        # Price ranges for the city
+        # Price ranges for the city - Much more lenient
         price_ranges = {
             'budget': {
-                'min': market_avg * 0.5,
+                'min': market_avg * 0.3,  # Reduced from 0.5
                 'max': market_avg * 0.8,
                 'description': f'Budget properties in {city}'
             },
             'mid_range': {
                 'min': market_avg * 0.8,
-                'max': market_avg * 1.2,
+                'max': market_avg * 1.4,  # Increased from 1.2
                 'description': f'Mid-range properties in {city}'
             },
             'premium': {
-                'min': market_avg * 1.2,
-                'max': market_avg * 2.0,
+                'min': market_avg * 1.4,  # Increased from 1.2
+                'max': market_avg * 2.5,  # Increased from 2.0
                 'description': f'Premium properties in {city}'
             }
         }
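
The widened bands above are easier to audit as a standalone classifier. A sketch under stated assumptions: `classify_price` is a hypothetical name, `market_avg` is assumed to be the city's average price per square foot, and the returned pair mirrors the (assessment, confidence) values set in the hunk.

```python
# Sketch of the relaxed price bands from the hunk above.
# classify_price is a hypothetical name; market_avg is assumed to be the
# city's average price per square foot.
def classify_price(price_per_sqft: float, market_avg: float) -> tuple:
    if price_per_sqft < 50:                   # was < 100
        return "suspicious_pricing", 0.2
    elif price_per_sqft < market_avg * 0.2:   # was 0.3
        return "below_market", 0.4
    elif price_per_sqft < market_avg * 0.6:   # was 0.7
        return "below_market", 0.7
    elif price_per_sqft <= market_avg * 1.5:  # was 1.3
        return "market_rate", 0.9
    elif price_per_sqft <= market_avg * 2.5:  # was 2.0
        return "above_market", 0.8
    else:
        return "premium_pricing", 0.6

# Worked example with an assumed market average of Rs. 6,000/sq.ft:
# 8,400/sq.ft is 1.4x the average - "above_market" under the old 1.3x
# cutoff, but "market_rate" under the new 1.5x cutoff.
print(classify_price(8400, 6000))  # ('market_rate', 0.9)
```

The practical effect is that listings between 1.3x and 1.5x the city average are now classed as market rate, and the above-market risk indicator only fires beyond 2.5x instead of 2.0x.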
models/trust_score.py CHANGED
@@ -6,60 +6,63 @@ import re
 
 def generate_trust_score(text, image_analysis, pdf_analysis):
     try:
-        # Start with a more reasonable base score
-        trust_score = 30.0  # Increased from 20.0 to give a more reasonable starting point
+        # Start with a much higher base score for legitimate properties
+        trust_score = 50.0  # Increased from 30.0 to 50.0 to give a more reasonable starting point
         reasoning_parts = []
 
         # Simple text-based trust indicators
         text_lower = str(text).lower()
 
-        # CRITICAL: Check for obvious fake data patterns - but be less punitive
+        # CRITICAL: Check for obvious fake data patterns - but be much less punitive
         fake_patterns = [
-            r'\b\d+\s*$',  # Numbers at end of lines
-            r'^\d+$',  # Only numbers
-            r'\b\d{1,2}\s*$',  # Single or double digits
-            r'price.*\d{1,3}',  # Very low prices
-            r'size.*\d{1,3}',  # Very small sizes
-            r'bedrooms.*\d{1,2}',  # Very few bedrooms
-            r'bathrooms.*\d{1,2}',  # Very few bathrooms
+            r'^\d+$',  # Only numbers (very strict)
+            r'price.*\d{1,2}',  # Very low prices (more lenient)
+            r'size.*\d{1,2}',  # Very small sizes (more lenient)
         ]
 
         fake_detected = False
         for pattern in fake_patterns:
             if re.search(pattern, text_lower):
-                fake_detected = True
-                trust_score -= 15  # Reduced penalty from 30 to 15
-                reasoning_parts.append("Detected suspicious number patterns")
-                break
-
-        # Check for repeated numbers (like "2, 2, 2, 2") - but be less punitive
+                # Only mark as fake if it's extremely obvious
+                if pattern == r'^\d+$' and len(text.strip()) <= 3:
+                    fake_detected = True
+                    trust_score -= 10  # Reduced penalty from 15 to 10
+                    reasoning_parts.append("Detected suspicious number patterns")
+                    break
+                # For other patterns, be more lenient
+                elif pattern in [r'price.*\d{1,2}', r'size.*\d{1,2}']:
+                    # Only mark as fake if multiple patterns are found
+                    continue
+
+        # Check for repeated numbers (like "2, 2, 2, 2") - but be much less punitive
         numbers = re.findall(r'\b\d+\b', text_lower)
-        if len(numbers) >= 3:
+        if len(numbers) >= 5:  # Increased threshold from 3 to 5
             unique_numbers = set(numbers)
-            if len(unique_numbers) <= 2:  # If most numbers are the same
+            if len(unique_numbers) <= 1:  # Only if ALL numbers are the same
                 fake_detected = True
-                trust_score -= 20  # Reduced penalty from 40 to 20
+                trust_score -= 15  # Reduced penalty from 20 to 15
                 reasoning_parts.append("Detected repeated number patterns (likely fake data)")
 
-        # Check for extremely low values - but be less punitive
-        if any(word in text_lower for word in ['₹2', '₹1', '₹3', '₹4', '₹5']):
+        # Check for extremely low values - but be much less punitive
+        if any(word in text_lower for word in ['₹1', '₹2']):  # Only extremely low values
             fake_detected = True
-            trust_score -= 25  # Reduced penalty from 50 to 25
+            trust_score -= 20  # Reduced penalty from 25 to 20
             reasoning_parts.append("Detected suspiciously low pricing")
 
-        # Check for very small property sizes - but be less punitive
-        if any(word in text_lower for word in ['2 sq', '1 sq', '3 sq', '4 sq', '5 sq']):
+        # Check for very small property sizes - but be much less punitive
+        if any(word in text_lower for word in ['1 sq', '2 sq']):  # Only extremely small
             fake_detected = True
-            trust_score -= 20  # Reduced penalty from 40 to 20
+            trust_score -= 15  # Reduced penalty from 20 to 15
             reasoning_parts.append("Detected suspiciously small property size")
 
-        # Positive trust indicators - More generous
+        # Positive trust indicators - Much more generous
         positive_indicators = [
             'apartment', 'flat', 'house', 'villa', 'bungalow', 'property', 'real estate',
             'bedroom', 'bathroom', 'kitchen', 'living', 'dining', 'balcony', 'parking',
             'amenities', 'facilities', 'security', 'lift', 'gym', 'pool', 'garden',
             'hyderabad', 'mumbai', 'delhi', 'bangalore', 'chennai', 'kolkata', 'pune',
-            'verified', 'authentic', 'genuine', 'legitimate', 'original', 'certified'
+            'verified', 'authentic', 'genuine', 'legitimate', 'original', 'certified',
+            'pg', 'hostel', 'office', 'commercial', 'retail', 'warehouse', 'industrial'
         ]
 
         negative_indicators = [
@@ -70,82 +73,82 @@ def generate_trust_score(text, image_analysis, pdf_analysis):
         positive_count = sum(1 for indicator in positive_indicators if indicator in text_lower)
         negative_count = sum(1 for indicator in negative_indicators if indicator in text_lower)
 
-        # Adjust score based on indicators - More balanced
+        # Adjust score based on indicators - Much more balanced
         if positive_count > 0 and not fake_detected:
-            trust_score += min(20, positive_count * 3)  # Increased from 15 to 20
+            trust_score += min(25, positive_count * 4)  # Increased from 20 to 25
             reasoning_parts.append(f"Found {positive_count} positive trust indicators")
 
         if negative_count > 0:
-            trust_score -= min(25, negative_count * 5)  # Reduced penalty from 30 to 25
+            trust_score -= min(20, negative_count * 4)  # Reduced penalty from 25 to 20
             reasoning_parts.append(f"Found {negative_count} negative trust indicators")
 
-        # Image analysis contribution - More balanced
+        # Image analysis contribution - Much more balanced
         if image_analysis:
             image_count = len(image_analysis) if isinstance(image_analysis, list) else 1
             if image_count > 0:
                 # Check if images are actually property-related
                 property_related_count = sum(1 for img in image_analysis if img.get('is_property_related', False))
                 if property_related_count > 0:
-                    trust_score += min(15, property_related_count * 4)  # Increased from 10 to 15
+                    trust_score += min(20, property_related_count * 5)  # Increased from 15 to 20
                     reasoning_parts.append(f"Property has {property_related_count} property-related images")
                 else:
-                    trust_score -= 15  # Reduced penalty from 20 to 15
+                    trust_score -= 10  # Reduced penalty from 15 to 10
                     reasoning_parts.append("No property-related images detected")
 
                 # Bonus for multiple high-quality images
                 if property_related_count >= 3:
-                    trust_score += 8  # Increased from 5 to 8
+                    trust_score += 12  # Increased from 8 to 12
                     reasoning_parts.append("Multiple property images provided")
 
-        # PDF analysis contribution - More balanced
+        # PDF analysis contribution - Much more balanced
         if pdf_analysis:
             pdf_count = len(pdf_analysis) if isinstance(pdf_analysis, list) else 1
             if pdf_count > 0:
                 # Check if documents are actually property-related
                 property_related_docs = sum(1 for doc in pdf_analysis if doc.get('is_property_related', False))
                 if property_related_docs > 0:
-                    trust_score += min(15, property_related_docs * 5)  # Increased from 10 to 15
+                    trust_score += min(20, property_related_docs * 6)  # Increased from 15 to 20
                     reasoning_parts.append(f"Property has {property_related_docs} property-related documents")
                 else:
-                    trust_score -= 10  # Reduced penalty from 15 to 10
+                    trust_score -= 8  # Reduced penalty from 10 to 8
                     reasoning_parts.append("No property-related documents detected")
 
                 # Bonus for multiple documents
                 if property_related_docs >= 2:
-                    trust_score += 5  # Increased from 3 to 5
+                    trust_score += 8  # Increased from 5 to 8
                     reasoning_parts.append("Multiple supporting documents provided")
 
-        # Text quality assessment - More balanced
+        # Text quality assessment - Much more balanced
         if text and len(text) > 200 and not fake_detected:
-            trust_score += 12  # Increased from 8 to 12
+            trust_score += 15  # Increased from 12 to 15
            reasoning_parts.append("Detailed property description provided")
         elif text and len(text) > 100 and not fake_detected:
-            trust_score += 8  # Increased from 4 to 8
+            trust_score += 10  # Increased from 8 to 10
             reasoning_parts.append("Adequate property description provided")
         elif len(text) < 50:
-            trust_score -= 15  # Reduced penalty from 20 to 15
+            trust_score -= 10  # Reduced penalty from 15 to 10
             reasoning_parts.append("Very short property description")
 
-        # Location quality assessment - More balanced
+        # Location quality assessment - Much more balanced
         if 'hyderabad' in text_lower or 'mumbai' in text_lower or 'delhi' in text_lower or 'bangalore' in text_lower:
             if not fake_detected:
-                trust_score += 5  # Increased from 3 to 5
+                trust_score += 8  # Increased from 5 to 8
                 reasoning_parts.append("Property in major city")
 
-        # Property type assessment - More balanced
-        if any(prop_type in text_lower for prop_type in ['apartment', 'flat', 'house', 'villa', 'bungalow']):
+        # Property type assessment - Much more balanced
+        if any(prop_type in text_lower for prop_type in ['apartment', 'flat', 'house', 'villa', 'bungalow', 'pg', 'office']):
             if not fake_detected:
-                trust_score += 4  # Increased from 2 to 4
+                trust_score += 6  # Increased from 4 to 6
                 reasoning_parts.append("Clear property type mentioned")
 
-        # Amenities assessment - More balanced
+        # Amenities assessment - Much more balanced
         amenities_count = sum(1 for amenity in ['pool', 'gym', 'garden', 'parking', 'security', 'lift', 'balcony']
                               if amenity in text_lower)
         if amenities_count > 0 and not fake_detected:
-            trust_score += min(8, amenities_count * 2)  # Increased from 5 to 8
+            trust_score += min(12, amenities_count * 3)  # Increased from 8 to 12
             reasoning_parts.append(f"Property has {amenities_count} amenities mentioned")
 
-        # CRITICAL: Additional fake data checks - but be less punitive
+        # CRITICAL: Additional fake data checks - but be much less punitive
         # Check if all major fields are just numbers
         numeric_fields = ['property_name', 'bedrooms', 'bathrooms', 'sq_ft', 'market_value']
         numeric_count = 0
@@ -153,14 +156,14 @@ def generate_trust_score(text, image_analysis, pdf_analysis):
             if field in text_lower and re.search(r'\b\d{1,2}\b', text_lower):
                 numeric_count += 1
 
-        if numeric_count >= 3:  # If 3+ fields are just numbers
+        if numeric_count >= 4:  # Increased threshold from 3 to 4
             fake_detected = True
-            trust_score -= 30  # Reduced penalty from 60 to 30
+            trust_score -= 25  # Reduced penalty from 30 to 25
             reasoning_parts.append("Multiple fields contain only numbers (highly suspicious)")
 
         # Ensure minimum score for any valid data
-        if trust_score < 10 and (image_analysis or pdf_analysis):
-            trust_score = 10  # Minimum score if there are images or documents
+        if trust_score < 20 and (image_analysis or pdf_analysis):
+            trust_score = 20  # Increased minimum score from 10 to 20
 
         # Ensure score is within bounds
         trust_score = max(0, min(100, trust_score))
@@ -175,4 +178,4 @@ def generate_trust_score(text, image_analysis, pdf_analysis):
 
     except Exception as e:
         logger.error(f"Error in trust score generation: {str(e)}")
-        return 20.0, f"Trust analysis failed: {str(e)}"  # Increased from 10.0 to 20.0
+        return 35.0, f"Trust analysis failed: {str(e)}"  # Increased from 20.0 to 35.0
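
Because the fake-data heuristics are the most behavior-changing part of this commit, the before/after of the repeated-number check is worth seeing side by side. A sketch: `looks_like_repeated_fake_data` is a hypothetical name wrapping only this one check, with `strict=True` for the new rule and `strict=False` for the old one. Note also that in the new pattern loop the `price.*\d{1,2}` and `size.*\d{1,2}` patterns only ever reach a `continue`, so as written they never set `fake_detected` on their own.

```python
import re

# Sketch of the tightened repeated-number heuristic from the hunk above.
# looks_like_repeated_fake_data is a hypothetical name.
def looks_like_repeated_fake_data(text: str, strict: bool = True) -> bool:
    numbers = re.findall(r'\b\d+\b', text.lower())
    if strict:
        # New rule: at least 5 numbers, and ALL of them identical.
        return len(numbers) >= 5 and len(set(numbers)) <= 1
    # Old rule: at least 3 numbers, and at most 2 distinct values.
    return len(numbers) >= 3 and len(set(numbers)) <= 2

# A legitimate "2 BHK, 2 bath, 2 balconies" listing tripped the old rule
# but passes the new one.
listing = "2 bhk apartment, 2 bathrooms, 2 balconies"
print(looks_like_repeated_fake_data(listing, strict=False))  # True (old)
print(looks_like_repeated_fake_data(listing, strict=True))   # False (new)
```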