Upload 22 files
Browse files
- app.py +85 -45
- models/cross_validation.py +333 -153
- models/fraud_classification.py +118 -135
- models/trust_score.py +136 -17
app.py
CHANGED
@@ -258,20 +258,48 @@ def calculate_final_verdict(results):
|
|
258 |
specs_verification = results.get('specs_verification', {})
|
259 |
quality_assessment = results.get('quality_assessment', {})
|
260 |
|
261 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
fraud_score = 0.0
|
263 |
fraud_level = fraud_classification.get('alert_level', 'minimal')
|
264 |
fraud_alert_score = fraud_classification.get('alert_score', 0.0)
|
265 |
|
266 |
fraud_score_mapping = {
|
267 |
-
'critical': 1.0,
|
268 |
-
'high': 0.8,
|
269 |
-
'medium': 0.6,
|
270 |
-
'low': 0.
|
271 |
-
'minimal': 0.1
|
272 |
}
|
273 |
fraud_score = fraud_score_mapping.get(fraud_level, 0.1) * fraud_alert_score
|
274 |
|
|
|
|
|
|
|
|
|
|
|
275 |
# Calculate trust score
|
276 |
trust_score = 0.0
|
277 |
if isinstance(trust_score_data, dict):
|
@@ -287,6 +315,10 @@ def calculate_final_verdict(results):
|
|
287 |
else:
|
288 |
trust_score = 0.0
|
289 |
|
|
|
|
|
|
|
|
|
290 |
# Calculate address verification score
|
291 |
address_score = 0.0
|
292 |
if address_verification and isinstance(address_verification, dict):
|
@@ -323,31 +355,16 @@ def calculate_final_verdict(results):
|
|
323 |
score = quality_assessment.get('score', 0.0)
|
324 |
quality_score = float(score) / 100.0 if score > 0 else 0.0
|
325 |
|
326 |
-
#
|
327 |
-
cross_validation_issues = 0
|
328 |
-
high_severity_issues = 0
|
329 |
-
medium_severity_issues = 0
|
330 |
-
|
331 |
-
if isinstance(cross_validation, list):
|
332 |
-
cross_validation_issues = len(cross_validation)
|
333 |
-
for issue in cross_validation:
|
334 |
-
if isinstance(issue, dict):
|
335 |
-
severity = issue.get('severity', 'low')
|
336 |
-
if severity == 'high':
|
337 |
-
high_severity_issues += 1
|
338 |
-
elif severity == 'medium':
|
339 |
-
medium_severity_issues += 1
|
340 |
-
|
341 |
-
# Weighted scoring system with improved weights
|
342 |
weights = {
|
343 |
-
'fraud': 0.
|
344 |
-
'trust': 0.25, #
|
345 |
'address': 0.15, # Address verification
|
346 |
'location': 0.10, # Location analysis
|
347 |
'price': 0.10, # Price analysis
|
348 |
-
'legal': 0.
|
349 |
-
'specs': 0.
|
350 |
-
'quality': 0.
|
351 |
}
|
352 |
|
353 |
# Calculate weighted score
|
@@ -366,32 +383,41 @@ def calculate_final_verdict(results):
|
|
366 |
logger.info(f"Score components: fraud={fraud_score:.3f}, trust={trust_score:.3f}, address={address_score:.3f}, location={location_score:.3f}, price={price_score:.3f}, legal={legal_score:.3f}, specs={specs_score:.3f}, quality={quality_score:.3f}")
|
367 |
logger.info(f"Weighted score before penalty: {weighted_score:.3f}")
|
368 |
|
369 |
-
#
|
370 |
issue_penalty = 0.0
|
|
|
|
|
371 |
if high_severity_issues > 0:
|
372 |
-
issue_penalty += high_severity_issues * 0.
|
373 |
if medium_severity_issues > 0:
|
374 |
-
issue_penalty += medium_severity_issues * 0.
|
|
|
|
|
375 |
|
376 |
weighted_score = max(0.0, weighted_score - issue_penalty)
|
377 |
|
378 |
logger.info(f"Issue penalty: {issue_penalty:.3f}, Final weighted score: {weighted_score:.3f}")
|
379 |
|
380 |
-
#
|
381 |
-
if
|
382 |
-
weighted_score = max(0.
|
|
|
|
|
383 |
|
384 |
-
#
|
385 |
-
if
|
|
|
|
|
|
|
386 |
verdict = 'VERIFIED REAL ESTATE LISTING'
|
387 |
risk_level = 'low'
|
388 |
-
elif weighted_score >= 0.60 and fraud_score < 0.
|
389 |
verdict = 'LIKELY LEGITIMATE'
|
390 |
risk_level = 'low'
|
391 |
-
elif weighted_score >= 0.40 and fraud_score < 0.
|
392 |
verdict = 'SUSPICIOUS LISTING'
|
393 |
risk_level = 'medium'
|
394 |
-
elif fraud_score >= 0.
|
395 |
verdict = 'HIGH RISK LISTING'
|
396 |
risk_level = 'high'
|
397 |
elif weighted_score >= 0.20:
|
@@ -404,10 +430,16 @@ def calculate_final_verdict(results):
|
|
404 |
# Generate detailed reasoning
|
405 |
reasoning_parts = []
|
406 |
|
|
|
|
|
|
|
|
|
|
|
|
|
407 |
if fraud_score > 0.3:
|
408 |
reasoning_parts.append(f"Fraud risk detected (level: {fraud_level})")
|
409 |
|
410 |
-
if trust_score < 0.
|
411 |
reasoning_parts.append(f"Low trust score ({trust_score:.1%})")
|
412 |
|
413 |
if address_score < 0.5:
|
@@ -439,12 +471,16 @@ def calculate_final_verdict(results):
|
|
439 |
# Ensure score is between 0 and 100
|
440 |
overall_score = max(0, min(100, overall_score))
|
441 |
|
442 |
-
#
|
443 |
-
if
|
|
|
|
|
444 |
overall_score = 15 # Minimum 15% score if any component is valid
|
445 |
|
446 |
-
# Final score adjustment based on data quality
|
447 |
-
if
|
|
|
|
|
448 |
overall_score = max(10, overall_score) # Minimum 10% for high risk
|
449 |
elif high_severity_issues >= 1:
|
450 |
overall_score = max(15, overall_score) # Minimum 15% for medium risk
|
@@ -467,8 +503,12 @@ def calculate_final_verdict(results):
|
|
467 |
'specs_score': specs_score,
|
468 |
'quality_score': quality_score,
|
469 |
'weighted_score': weighted_score,
|
470 |
-
'cross_validation_issues':
|
471 |
-
'high_severity_issues': high_severity_issues
|
|
|
|
|
|
|
|
|
472 |
}
|
473 |
}
|
474 |
|
|
|
258 |
specs_verification = results.get('specs_verification', {})
|
259 |
quality_assessment = results.get('quality_assessment', {})
|
260 |
|
261 |
+
# CRITICAL: Check for fake data patterns in cross validation
|
262 |
+
fake_data_detected = False
|
263 |
+
fraudulent_issues = 0
|
264 |
+
high_severity_issues = 0
|
265 |
+
medium_severity_issues = 0
|
266 |
+
low_severity_issues = 0
|
267 |
+
|
268 |
+
if isinstance(cross_validation, list):
|
269 |
+
for issue in cross_validation:
|
270 |
+
if isinstance(issue, dict):
|
271 |
+
status = issue.get('status', '')
|
272 |
+
severity = issue.get('severity', 'low')
|
273 |
+
|
274 |
+
if status == 'fraudulent':
|
275 |
+
fraudulent_issues += 1
|
276 |
+
fake_data_detected = True
|
277 |
+
elif severity == 'high':
|
278 |
+
high_severity_issues += 1
|
279 |
+
elif severity == 'medium':
|
280 |
+
medium_severity_issues += 1
|
281 |
+
elif severity == 'low':
|
282 |
+
low_severity_issues += 1
|
283 |
+
|
284 |
+
# Calculate fraud risk score - Much stricter
|
285 |
fraud_score = 0.0
|
286 |
fraud_level = fraud_classification.get('alert_level', 'minimal')
|
287 |
fraud_alert_score = fraud_classification.get('alert_score', 0.0)
|
288 |
|
289 |
fraud_score_mapping = {
|
290 |
+
'critical': 1.0, # Increased back to full penalty
|
291 |
+
'high': 0.8, # Increased back to full penalty
|
292 |
+
'medium': 0.6, # Increased back to full penalty
|
293 |
+
'low': 0.4, # Increased penalty
|
294 |
+
'minimal': 0.1 # Increased penalty
|
295 |
}
|
296 |
fraud_score = fraud_score_mapping.get(fraud_level, 0.1) * fraud_alert_score
|
297 |
|
298 |
+
# CRITICAL: Heavy penalty for fake data
|
299 |
+
if fake_data_detected:
|
300 |
+
fraud_score = max(fraud_score, 0.8) # Minimum 80% fraud score for fake data
|
301 |
+
fraud_level = 'high'
|
302 |
+
|
303 |
# Calculate trust score
|
304 |
trust_score = 0.0
|
305 |
if isinstance(trust_score_data, dict):
|
|
|
315 |
else:
|
316 |
trust_score = 0.0
|
317 |
|
318 |
+
# CRITICAL: Heavy penalty for fake data in trust score
|
319 |
+
if fake_data_detected:
|
320 |
+
trust_score = max(0.0, trust_score - 0.5) # Subtract an absolute 0.5 from the trust score (floored at 0.0) for fake data - not a 50% relative reduction
|
321 |
+
|
322 |
# Calculate address verification score
|
323 |
address_score = 0.0
|
324 |
if address_verification and isinstance(address_verification, dict):
|
|
|
355 |
score = quality_assessment.get('score', 0.0)
|
356 |
quality_score = float(score) / 100.0 if score > 0 else 0.0
|
357 |
|
358 |
+
# Much stricter weighted scoring system
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
359 |
weights = {
|
360 |
+
'fraud': 0.35, # Increased weight for fraud detection
|
361 |
+
'trust': 0.25, # Keep trust score important
|
362 |
'address': 0.15, # Address verification
|
363 |
'location': 0.10, # Location analysis
|
364 |
'price': 0.10, # Price analysis
|
365 |
+
'legal': 0.03, # Legal analysis
|
366 |
+
'specs': 0.01, # Specs verification
|
367 |
+
'quality': 0.01 # Quality assessment
|
368 |
}
|
369 |
|
370 |
# Calculate weighted score
|
|
|
383 |
logger.info(f"Score components: fraud={fraud_score:.3f}, trust={trust_score:.3f}, address={address_score:.3f}, location={location_score:.3f}, price={price_score:.3f}, legal={legal_score:.3f}, specs={specs_score:.3f}, quality={quality_score:.3f}")
|
384 |
logger.info(f"Weighted score before penalty: {weighted_score:.3f}")
|
385 |
|
386 |
+
# Much stricter penalty system
|
387 |
issue_penalty = 0.0
|
388 |
+
if fraudulent_issues > 0:
|
389 |
+
issue_penalty += fraudulent_issues * 0.15 # 15% penalty per fraudulent issue
|
390 |
if high_severity_issues > 0:
|
391 |
+
issue_penalty += high_severity_issues * 0.10 # 10% penalty per high severity issue
|
392 |
if medium_severity_issues > 0:
|
393 |
+
issue_penalty += medium_severity_issues * 0.05 # 5% penalty per medium severity issue
|
394 |
+
if low_severity_issues > 0:
|
395 |
+
issue_penalty += low_severity_issues * 0.02 # 2% penalty per low severity issue
|
396 |
|
397 |
weighted_score = max(0.0, weighted_score - issue_penalty)
|
398 |
|
399 |
logger.info(f"Issue penalty: {issue_penalty:.3f}, Final weighted score: {weighted_score:.3f}")
|
400 |
|
401 |
+
# CRITICAL: Much stricter minimum score requirements
|
402 |
+
if fake_data_detected:
|
403 |
+
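weighted_score = max(0.05, weighted_score) # Floors the score at 5% for fake data; NOTE(review): max() sets a minimum, not a cap - if a 5% maximum was intended this should be min(); confirm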
|
404 |
+
elif any([trust_score > 0, address_score > 0, location_score > 0, price_score > 0]):
|
405 |
+
weighted_score = max(0.15, weighted_score) # Minimum 15% for any valid data
|
406 |
|
407 |
+
# Much stricter verdict determination
|
408 |
+
if fake_data_detected or fraudulent_issues > 0:
|
409 |
+
verdict = 'HIGH RISK LISTING'
|
410 |
+
risk_level = 'high'
|
411 |
+
elif weighted_score >= 0.75 and fraud_score < 0.2 and high_severity_issues == 0:
|
412 |
verdict = 'VERIFIED REAL ESTATE LISTING'
|
413 |
risk_level = 'low'
|
414 |
+
elif weighted_score >= 0.60 and fraud_score < 0.3 and high_severity_issues <= 1:
|
415 |
verdict = 'LIKELY LEGITIMATE'
|
416 |
risk_level = 'low'
|
417 |
+
elif weighted_score >= 0.40 and fraud_score < 0.5 and high_severity_issues <= 2:
|
418 |
verdict = 'SUSPICIOUS LISTING'
|
419 |
risk_level = 'medium'
|
420 |
+
elif fraud_score >= 0.5 or weighted_score < 0.20 or high_severity_issues >= 3:
|
421 |
verdict = 'HIGH RISK LISTING'
|
422 |
risk_level = 'high'
|
423 |
elif weighted_score >= 0.20:
|
|
|
430 |
# Generate detailed reasoning
|
431 |
reasoning_parts = []
|
432 |
|
433 |
+
if fake_data_detected:
|
434 |
+
reasoning_parts.append("Fake data patterns detected")
|
435 |
+
|
436 |
+
if fraudulent_issues > 0:
|
437 |
+
reasoning_parts.append(f"{fraudulent_issues} fraudulent validation issues")
|
438 |
+
|
439 |
if fraud_score > 0.3:
|
440 |
reasoning_parts.append(f"Fraud risk detected (level: {fraud_level})")
|
441 |
|
442 |
+
if trust_score < 0.3:
|
443 |
reasoning_parts.append(f"Low trust score ({trust_score:.1%})")
|
444 |
|
445 |
if address_score < 0.5:
|
|
|
471 |
# Ensure score is between 0 and 100
|
472 |
overall_score = max(0, min(100, overall_score))
|
473 |
|
474 |
+
# CRITICAL: Much stricter minimum score for fake data
|
475 |
+
if fake_data_detected:
|
476 |
+
overall_score = max(5, min(15, overall_score)) # 5-15% range for fake data
|
477 |
+
elif overall_score == 0 and any([trust_score > 0, address_score > 0, location_score > 0]):
|
478 |
overall_score = 15 # Minimum 15% score if any component is valid
|
479 |
|
480 |
+
# Final score adjustment based on data quality - Much stricter
|
481 |
+
if fake_data_detected or fraudulent_issues > 0:
|
482 |
+
overall_score = max(5, min(15, overall_score)) # 5-15% for fake/fraudulent data
|
483 |
+
elif high_severity_issues >= 3:
|
484 |
overall_score = max(10, overall_score) # Minimum 10% for high risk
|
485 |
elif high_severity_issues >= 1:
|
486 |
overall_score = max(15, overall_score) # Minimum 15% for medium risk
|
|
|
503 |
'specs_score': specs_score,
|
504 |
'quality_score': quality_score,
|
505 |
'weighted_score': weighted_score,
|
506 |
+
'cross_validation_issues': len(cross_validation) if isinstance(cross_validation, list) else 0,
|
507 |
+
'high_severity_issues': high_severity_issues,
|
508 |
+
'medium_severity_issues': medium_severity_issues,
|
509 |
+
'low_severity_issues': low_severity_issues,
|
510 |
+
'fraudulent_issues': fraudulent_issues,
|
511 |
+
'fake_data_detected': fake_data_detected
|
512 |
}
|
513 |
}
|
514 |
|
models/cross_validation.py
CHANGED
@@ -69,11 +69,11 @@ def analyze_property_description(description: str, property_data: Dict[str, Any]
|
|
69 |
'suspicious_patterns': []
|
70 |
}
|
71 |
|
72 |
-
# Check room number consistency
|
73 |
if 'bedroom' in analysis['room_mentions']:
|
74 |
stated_bedrooms = safe_int_convert(property_data.get('bedrooms', 0))
|
75 |
mentioned_bedrooms = max(analysis['room_mentions']['bedroom'])
|
76 |
-
if stated_bedrooms != mentioned_bedrooms:
|
77 |
analysis['inconsistencies'].append({
|
78 |
'type': 'bedroom_count',
|
79 |
'stated': stated_bedrooms,
|
@@ -84,7 +84,7 @@ def analyze_property_description(description: str, property_data: Dict[str, Any]
|
|
84 |
if 'bathroom' in analysis['room_mentions']:
|
85 |
stated_bathrooms = safe_float_convert(property_data.get('bathrooms', 0))
|
86 |
mentioned_bathrooms = max(analysis['room_mentions']['bathroom'])
|
87 |
-
if abs(stated_bathrooms - mentioned_bathrooms) > 0
|
88 |
analysis['inconsistencies'].append({
|
89 |
'type': 'bathroom_count',
|
90 |
'stated': stated_bathrooms,
|
@@ -92,30 +92,47 @@ def analyze_property_description(description: str, property_data: Dict[str, Any]
|
|
92 |
'message': f'Description mentions {mentioned_bathrooms} bathrooms but listing states {stated_bathrooms} bathrooms.'
|
93 |
})
|
94 |
|
95 |
-
# Check property type consistency
|
96 |
property_type = property_data.get('property_type', '').lower()
|
97 |
-
if property_type
|
98 |
-
|
99 |
-
|
100 |
-
'
|
101 |
-
'
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
-
# Check for suspicious patterns
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
(r'no.*verification', 'Avoiding verification'),
|
109 |
-
(r'urgent.*sale', 'Pressure tactics'),
|
110 |
-
(r'below.*market', 'Unrealistic pricing')
|
111 |
]
|
112 |
|
113 |
-
|
114 |
-
|
|
|
115 |
analysis['suspicious_patterns'].append({
|
116 |
-
'pattern':
|
117 |
-
'
|
118 |
-
'message': f'Suspicious pattern detected: {reason}'
|
119 |
})
|
120 |
|
121 |
return analysis
|
@@ -425,149 +442,312 @@ def analyze_documents_and_images(data: Dict[str, Any]) -> Dict[str, Any]:
|
|
425 |
return analysis
|
426 |
|
427 |
def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
428 |
-
"""
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
try:
|
433 |
-
# Load the tiny model for classification with fallback
|
434 |
-
try:
|
435 |
-
classifier = load_model("zero-shot-classification")
|
436 |
-
except Exception as e:
|
437 |
-
logger.warning(f"Could not load classifier for cross validation: {str(e)}")
|
438 |
-
classifier = None
|
439 |
-
|
440 |
-
# Initialize analysis sections
|
441 |
analysis_sections = {
|
442 |
'basic_info': [],
|
443 |
'location': [],
|
444 |
'specifications': [],
|
445 |
'documents': [],
|
446 |
-
'
|
|
|
|
|
447 |
}
|
448 |
-
|
449 |
-
# Process and validate data
|
450 |
-
processed_data = {}
|
451 |
|
452 |
-
#
|
453 |
-
|
454 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
455 |
analysis_sections['basic_info'].append({
|
456 |
-
'check': '
|
457 |
-
'status': '
|
458 |
-
'message': '
|
459 |
-
'details': '
|
460 |
'severity': 'high',
|
461 |
-
'recommendation': '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
462 |
})
|
463 |
|
464 |
-
|
|
|
465 |
if not property_type:
|
466 |
analysis_sections['basic_info'].append({
|
467 |
-
'check': '
|
468 |
'status': 'missing',
|
469 |
'message': 'Property type is required.',
|
470 |
-
'details': 'Please specify the type
|
471 |
-
'severity': 'high',
|
472 |
-
'recommendation': '
|
473 |
})
|
474 |
|
475 |
-
|
|
|
476 |
if not status:
|
477 |
analysis_sections['basic_info'].append({
|
478 |
-
'check': '
|
479 |
'status': 'missing',
|
480 |
'message': 'Property status is required.',
|
481 |
-
'details': 'Please specify if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
482 |
'severity': 'high',
|
483 |
-
'recommendation': '
|
484 |
})
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
'
|
491 |
-
'
|
492 |
-
'message': 'Invalid market value.',
|
493 |
-
'details': 'The market value must be a realistic amount.',
|
494 |
'severity': 'high',
|
495 |
-
'recommendation': '
|
496 |
})
|
497 |
-
|
498 |
-
|
499 |
-
location_analysis = analyze_location_consistency(data)
|
500 |
-
for inconsistency in location_analysis['inconsistencies']:
|
501 |
analysis_sections['location'].append({
|
502 |
-
'check':
|
503 |
-
'status': '
|
504 |
-
'message':
|
505 |
-
'details':
|
506 |
'severity': 'high',
|
507 |
-
'recommendation': '
|
508 |
})
|
509 |
-
|
510 |
-
#
|
511 |
-
|
512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
513 |
analysis_sections['specifications'].append({
|
514 |
-
'check':
|
515 |
-
'status': '
|
516 |
-
'message':
|
517 |
-
'details': f'
|
518 |
-
'severity': 'high',
|
519 |
-
'recommendation': '
|
520 |
})
|
521 |
|
522 |
-
|
523 |
analysis_sections['specifications'].append({
|
524 |
-
'check':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
525 |
'status': 'suspicious',
|
526 |
-
'message':
|
527 |
-
'details': f'
|
528 |
'severity': 'medium',
|
529 |
-
'recommendation': '
|
530 |
})
|
531 |
-
|
532 |
-
#
|
533 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
534 |
if description:
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
'
|
541 |
-
'
|
|
|
|
|
542 |
'severity': 'high',
|
543 |
-
'recommendation': '
|
544 |
})
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
'
|
549 |
-
'
|
550 |
-
'
|
551 |
-
'
|
552 |
-
'
|
553 |
-
'recommendation': 'Please review the property description for accuracy.'
|
554 |
})
|
555 |
-
|
556 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
557 |
media_analysis = analyze_documents_and_images(data)
|
558 |
|
559 |
-
# Helper function to check if files exist in data
|
560 |
def check_files_exist(files):
|
|
|
561 |
if not files:
|
562 |
return False
|
563 |
if isinstance(files, str):
|
564 |
files = [files]
|
565 |
-
|
566 |
-
|
567 |
-
|
|
|
|
|
|
|
|
|
|
|
568 |
if media_analysis['total_documents'] == 0:
|
569 |
-
# Check if documents were actually provided in the data
|
570 |
-
documents = data.get('documents', [])
|
571 |
if check_files_exist(documents):
|
572 |
# Files exist but couldn't be analyzed
|
573 |
analysis_sections['documents'].append({
|
@@ -575,16 +755,16 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
575 |
'status': 'error',
|
576 |
'message': 'Could not analyze provided documents.',
|
577 |
'details': 'Please ensure documents are in PDF format and are accessible.',
|
578 |
-
'severity': '
|
579 |
'recommendation': 'Please check document format and try again.'
|
580 |
})
|
581 |
else:
|
582 |
analysis_sections['documents'].append({
|
583 |
'check': 'documents_validation',
|
584 |
'status': 'missing',
|
585 |
-
'message': 'Property documents are
|
586 |
'details': 'Please upload relevant property documents in PDF format.',
|
587 |
-
'severity': '
|
588 |
'recommendation': 'Upload property documents in PDF format.'
|
589 |
})
|
590 |
else:
|
@@ -595,7 +775,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
595 |
'status': 'error',
|
596 |
'message': f'Error analyzing document: {doc["error"]}',
|
597 |
'details': doc['summary'],
|
598 |
-
'severity': '
|
599 |
'recommendation': 'Please ensure the document is a valid PDF file.'
|
600 |
})
|
601 |
elif doc['authenticity'] != 'verified':
|
@@ -604,14 +784,13 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
604 |
'status': 'unverified',
|
605 |
'message': 'Document authenticity could not be verified.',
|
606 |
'details': doc['summary'],
|
607 |
-
'severity': '
|
608 |
'recommendation': 'Please provide clear, legible documents.'
|
609 |
})
|
610 |
|
611 |
-
#
|
|
|
612 |
if media_analysis['total_images'] == 0:
|
613 |
-
# Check if images were actually provided in the data
|
614 |
-
images = data.get('images', [])
|
615 |
if check_files_exist(images):
|
616 |
# Files exist but couldn't be analyzed
|
617 |
analysis_sections['documents'].append({
|
@@ -619,16 +798,16 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
619 |
'status': 'error',
|
620 |
'message': 'Could not analyze provided images.',
|
621 |
'details': 'Please ensure images are in JPG or PNG format and are accessible.',
|
622 |
-
'severity': '
|
623 |
'recommendation': 'Please check image format and try again.'
|
624 |
})
|
625 |
else:
|
626 |
analysis_sections['documents'].append({
|
627 |
'check': 'images_validation',
|
628 |
'status': 'missing',
|
629 |
-
'message': 'Property images are
|
630 |
'details': 'Please upload at least one image of the property.',
|
631 |
-
'severity': '
|
632 |
'recommendation': 'Upload property images in JPG or PNG format.'
|
633 |
})
|
634 |
else:
|
@@ -639,7 +818,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
639 |
'status': 'error',
|
640 |
'message': f'Error analyzing image: {img["error"]}',
|
641 |
'details': img['description'],
|
642 |
-
'severity': '
|
643 |
'recommendation': 'Please ensure the image is in JPG or PNG format.'
|
644 |
})
|
645 |
elif not img['is_property_image']:
|
@@ -648,7 +827,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
648 |
'status': 'unverified',
|
649 |
'message': 'Image may not be property-related.',
|
650 |
'details': img['description'],
|
651 |
-
'severity': '
|
652 |
'recommendation': 'Please provide clear property images.'
|
653 |
})
|
654 |
|
@@ -657,17 +836,10 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
657 |
analysis_sections['documents'].append({
|
658 |
'check': 'media_verification_scores',
|
659 |
'status': 'valid',
|
660 |
-
'message': 'Media
|
661 |
-
'details': {
|
662 |
-
'document_verification_score': media_analysis['document_verification_score'],
|
663 |
-
'image_verification_score': media_analysis['image_verification_score'],
|
664 |
-
'total_documents': media_analysis['total_documents'],
|
665 |
-
'total_images': media_analysis['total_images'],
|
666 |
-
'verified_documents': media_analysis['verified_documents'],
|
667 |
-
'verified_images': media_analysis['verified_images']
|
668 |
-
},
|
669 |
'severity': 'low',
|
670 |
-
'recommendation': '
|
671 |
})
|
672 |
|
673 |
# Generate Summary
|
@@ -686,13 +858,16 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
686 |
'inconsistent': 0,
|
687 |
'missing': 0,
|
688 |
'error': 0,
|
689 |
-
'unverified': 0
|
|
|
690 |
},
|
691 |
'fraud_risk_level': 'low',
|
692 |
'media_verification': {
|
693 |
'document_score': media_analysis['document_verification_score'],
|
694 |
'image_score': media_analysis['image_verification_score']
|
695 |
-
}
|
|
|
|
|
696 |
}
|
697 |
|
698 |
# Calculate statistics
|
@@ -703,12 +878,16 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
703 |
if check['status'] in summary['status_counts']:
|
704 |
summary['status_counts'][check['status']] += 1
|
705 |
|
706 |
-
# Calculate fraud risk level
|
707 |
high_severity_issues = summary['severity_counts']['high']
|
708 |
-
|
|
|
|
|
709 |
summary['fraud_risk_level'] = 'high'
|
710 |
-
elif high_severity_issues >
|
711 |
summary['fraud_risk_level'] = 'medium'
|
|
|
|
|
712 |
|
713 |
# Add summary to analysis
|
714 |
analysis_sections['summary'] = [{
|
@@ -720,21 +899,22 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
720 |
'recommendation': f'Fraud Risk Level: {summary["fraud_risk_level"].upper()}. Review all findings and address high severity issues first.'
|
721 |
}]
|
722 |
|
723 |
-
#
|
|
|
724 |
for section_name, checks in analysis_sections.items():
|
725 |
for check in checks:
|
726 |
-
check['
|
727 |
-
|
728 |
|
729 |
-
return
|
730 |
|
731 |
except Exception as e:
|
732 |
-
logger.error(f"Error
|
733 |
return [{
|
734 |
'check': 'cross_validation_error',
|
735 |
'status': 'error',
|
736 |
-
'message': f'
|
737 |
-
'
|
738 |
-
'severity': '
|
739 |
'recommendation': 'Please try again or contact support.'
|
740 |
}]
|
|
|
69 |
'suspicious_patterns': []
|
70 |
}
|
71 |
|
72 |
+
# Check room number consistency - More lenient matching
|
73 |
if 'bedroom' in analysis['room_mentions']:
|
74 |
stated_bedrooms = safe_int_convert(property_data.get('bedrooms', 0))
|
75 |
mentioned_bedrooms = max(analysis['room_mentions']['bedroom'])
|
76 |
+
if stated_bedrooms != mentioned_bedrooms and abs(stated_bedrooms - mentioned_bedrooms) > 1:
|
77 |
analysis['inconsistencies'].append({
|
78 |
'type': 'bedroom_count',
|
79 |
'stated': stated_bedrooms,
|
|
|
84 |
if 'bathroom' in analysis['room_mentions']:
|
85 |
stated_bathrooms = safe_float_convert(property_data.get('bathrooms', 0))
|
86 |
mentioned_bathrooms = max(analysis['room_mentions']['bathroom'])
|
87 |
+
if abs(stated_bathrooms - mentioned_bathrooms) > 1.0: # More lenient for bathrooms
|
88 |
analysis['inconsistencies'].append({
|
89 |
'type': 'bathroom_count',
|
90 |
'stated': stated_bathrooms,
|
|
|
92 |
'message': f'Description mentions {mentioned_bathrooms} bathrooms but listing states {stated_bathrooms} bathrooms.'
|
93 |
})
|
94 |
|
95 |
+
# Check property type consistency - More flexible matching
|
96 |
property_type = property_data.get('property_type', '').lower()
|
97 |
+
if property_type:
|
98 |
+
# Create flexible property type patterns
|
99 |
+
property_type_patterns = {
|
100 |
+
'apartment': ['apartment', 'flat', 'unit', 'condo'],
|
101 |
+
'house': ['house', 'home', 'villa', 'bungalow', 'townhouse'],
|
102 |
+
'plot': ['plot', 'land', 'site'],
|
103 |
+
'commercial': ['commercial', 'office', 'shop', 'retail']
|
104 |
+
}
|
105 |
+
|
106 |
+
# Check if property type is mentioned in description
|
107 |
+
description_lower = description.lower()
|
108 |
+
type_found = False
|
109 |
+
|
110 |
+
for category, patterns in property_type_patterns.items():
|
111 |
+
if property_type in category or any(pattern in property_type for pattern in patterns):
|
112 |
+
if any(pattern in description_lower for pattern in patterns):
|
113 |
+
type_found = True
|
114 |
+
break
|
115 |
+
|
116 |
+
# Only flag if property type is completely missing and description is substantial
|
117 |
+
if not type_found and len(description) > 100:
|
118 |
+
analysis['inconsistencies'].append({
|
119 |
+
'type': 'property_type',
|
120 |
+
'stated': property_type,
|
121 |
+
'message': f'Property type "{property_type}" not mentioned in description.'
|
122 |
+
})
|
123 |
|
124 |
+
# Check for suspicious patterns - More lenient
|
125 |
+
suspicious_keywords = [
|
126 |
+
'urgent sale', 'quick sale', 'no documents needed', 'cash only',
|
127 |
+
'below market', 'distress sale', 'owner abroad', 'inheritance'
|
|
|
|
|
|
|
128 |
]
|
129 |
|
130 |
+
description_lower = description.lower()
|
131 |
+
for keyword in suspicious_keywords:
|
132 |
+
if keyword in description_lower:
|
133 |
analysis['suspicious_patterns'].append({
|
134 |
+
'pattern': keyword,
|
135 |
+
'message': f'Description contains potentially suspicious phrase: "{keyword}"'
|
|
|
136 |
})
|
137 |
|
138 |
return analysis
|
|
|
442 |
return analysis
|
443 |
|
444 |
def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
445 |
+
"""
|
446 |
+
Perform comprehensive cross-validation of property data.
|
447 |
+
"""
|
|
|
448 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
449 |
analysis_sections = {
|
450 |
'basic_info': [],
|
451 |
'location': [],
|
452 |
'specifications': [],
|
453 |
'documents': [],
|
454 |
+
'images': [],
|
455 |
+
'pricing': [],
|
456 |
+
'description': []
|
457 |
}
|
|
|
|
|
|
|
458 |
|
459 |
+
# CRITICAL: Check for obvious fake data patterns first
|
460 |
+
fake_data_detected = False
|
461 |
+
fake_indicators = []
|
462 |
+
|
463 |
+
# Check for numeric-only property names
|
464 |
+
property_name = data.get('property_name', '').strip()
|
465 |
+
if property_name.isdigit() or property_name in ['1', '2', '3', '4', '5']:
|
466 |
+
fake_data_detected = True
|
467 |
+
fake_indicators.append("Property name is just a number")
|
468 |
analysis_sections['basic_info'].append({
|
469 |
+
'check': 'property_name',
|
470 |
+
'status': 'fraudulent',
|
471 |
+
'message': 'Property name is just a number (highly suspicious).',
|
472 |
+
'details': f'Property name: {property_name}',
|
473 |
'severity': 'high',
|
474 |
+
'recommendation': 'Provide a real property name'
|
475 |
+
})
|
476 |
+
|
477 |
+
# Check for suspiciously low values
|
478 |
+
market_value = safe_float_convert(data.get('market_value', 0))
|
479 |
+
if market_value <= 10: # Extremely low threshold
|
480 |
+
fake_data_detected = True
|
481 |
+
fake_indicators.append("Suspiciously low market value")
|
482 |
+
analysis_sections['pricing'].append({
|
483 |
+
'check': 'market_value',
|
484 |
+
'status': 'fraudulent',
|
485 |
+
'message': 'Market value is suspiciously low.',
|
486 |
+
'details': f'Market value: ₹{market_value:,.0f}',
|
487 |
+
'severity': 'high',
|
488 |
+
'recommendation': 'Provide realistic market value'
|
489 |
+
})
|
490 |
+
|
491 |
+
# Check for unrealistic property sizes
|
492 |
+
square_feet = safe_float_convert(data.get('sq_ft', 0))
|
493 |
+
if square_feet <= 10: # Extremely small
|
494 |
+
fake_data_detected = True
|
495 |
+
fake_indicators.append("Unrealistic property size")
|
496 |
+
analysis_sections['specifications'].append({
|
497 |
+
'check': 'square_feet',
|
498 |
+
'status': 'fraudulent',
|
499 |
+
'message': 'Property size is unrealistically small.',
|
500 |
+
'details': f'Square feet: {square_feet}',
|
501 |
+
'severity': 'high',
|
502 |
+
'recommendation': 'Provide realistic property size'
|
503 |
+
})
|
504 |
+
|
505 |
+
# Check for repeated suspicious numbers
|
506 |
+
all_values = [
|
507 |
+
str(data.get('bedrooms', '')),
|
508 |
+
str(data.get('bathrooms', '')),
|
509 |
+
str(data.get('total_rooms', '')),
|
510 |
+
str(data.get('parking', '')),
|
511 |
+
str(data.get('year_built', '')),
|
512 |
+
str(data.get('market_value', '')),
|
513 |
+
str(data.get('sq_ft', ''))
|
514 |
+
]
|
515 |
+
|
516 |
+
numeric_values = [v for v in all_values if v.isdigit()]
|
517 |
+
if len(numeric_values) >= 3:
|
518 |
+
unique_values = set(numeric_values)
|
519 |
+
if len(unique_values) <= 2: # Most values are the same
|
520 |
+
fake_data_detected = True
|
521 |
+
fake_indicators.append("Multiple fields have same suspicious values")
|
522 |
+
analysis_sections['basic_info'].append({
|
523 |
+
'check': 'repeated_values',
|
524 |
+
'status': 'fraudulent',
|
525 |
+
'message': 'Multiple fields contain the same suspicious values.',
|
526 |
+
'details': f'Repeated values: {unique_values}',
|
527 |
+
'severity': 'high',
|
528 |
+
'recommendation': 'Provide realistic and varied property details'
|
529 |
+
})
|
530 |
+
|
531 |
+
# Basic information validation - Handle flat data structure
|
532 |
+
if not property_name or len(property_name) < 3:
|
533 |
+
analysis_sections['basic_info'].append({
|
534 |
+
'check': 'property_name',
|
535 |
+
'status': 'missing',
|
536 |
+
'message': 'Property name is required.',
|
537 |
+
'details': 'Please provide a valid property name.',
|
538 |
+
'severity': 'high' if fake_data_detected else 'medium',
|
539 |
+
'recommendation': 'Provide a valid property name (not just numbers)'
|
540 |
})
|
541 |
|
542 |
+
# Property type validation
|
543 |
+
property_type = data.get('property_type', '').strip()
|
544 |
if not property_type:
|
545 |
analysis_sections['basic_info'].append({
|
546 |
+
'check': 'property_type',
|
547 |
'status': 'missing',
|
548 |
'message': 'Property type is required.',
|
549 |
+
'details': 'Please specify the property type.',
|
550 |
+
'severity': 'high' if fake_data_detected else 'medium',
|
551 |
+
'recommendation': 'Specify property type (apartment, house, etc.)'
|
552 |
})
|
553 |
|
554 |
+
# Status validation
|
555 |
+
status = data.get('status', '').strip()
|
556 |
if not status:
|
557 |
analysis_sections['basic_info'].append({
|
558 |
+
'check': 'status',
|
559 |
'status': 'missing',
|
560 |
'message': 'Property status is required.',
|
561 |
+
'details': 'Please specify if property is for sale or rent.',
|
562 |
+
'severity': 'high' if fake_data_detected else 'medium',
|
563 |
+
'recommendation': 'Specify property status (for sale, for rent, etc.)'
|
564 |
+
})
|
565 |
+
|
566 |
+
# Location validation - Handle flat data structure
|
567 |
+
address = data.get('address', '').strip()
|
568 |
+
city = data.get('city', '').strip()
|
569 |
+
state = data.get('state', '').strip()
|
570 |
+
postal_code = data.get('postal_code', '').strip()
|
571 |
+
|
572 |
+
if not address:
|
573 |
+
analysis_sections['location'].append({
|
574 |
+
'check': 'address',
|
575 |
+
'status': 'missing',
|
576 |
+
'message': 'Property address is required.',
|
577 |
+
'details': 'Please provide the complete property address.',
|
578 |
'severity': 'high',
|
579 |
+
'recommendation': 'Provide complete property address'
|
580 |
})
|
581 |
+
|
582 |
+
if not city:
|
583 |
+
analysis_sections['location'].append({
|
584 |
+
'check': 'city',
|
585 |
+
'status': 'missing',
|
586 |
+
'message': 'City is required.',
|
587 |
+
'details': 'Please specify the city.',
|
|
|
|
|
588 |
'severity': 'high',
|
589 |
+
'recommendation': 'Specify the city'
|
590 |
})
|
591 |
+
|
592 |
+
if not state:
|
|
|
|
|
593 |
analysis_sections['location'].append({
|
594 |
+
'check': 'state',
|
595 |
+
'status': 'missing',
|
596 |
+
'message': 'State is required.',
|
597 |
+
'details': 'Please specify the state.',
|
598 |
'severity': 'high',
|
599 |
+
'recommendation': 'Specify the state'
|
600 |
})
|
601 |
+
|
602 |
+
# Postal code validation - more lenient
|
603 |
+
if postal_code:
|
604 |
+
if not postal_code.isdigit() or len(postal_code) < 5:
|
605 |
+
analysis_sections['location'].append({
|
606 |
+
'check': 'postal_code',
|
607 |
+
'status': 'invalid',
|
608 |
+
'message': 'Invalid postal code format.',
|
609 |
+
'details': f'Postal code: {postal_code}',
|
610 |
+
'severity': 'low',
|
611 |
+
'recommendation': 'Provide a valid postal code'
|
612 |
+
})
|
613 |
+
|
614 |
+
# Specifications validation - Handle flat data structure
|
615 |
+
bedrooms = safe_int_convert(data.get('bedrooms', 0))
|
616 |
+
bathrooms = safe_float_convert(data.get('bathrooms', 0))
|
617 |
+
year_built = safe_int_convert(data.get('year_built', 0))
|
618 |
+
|
619 |
+
# Much stricter validation ranges
|
620 |
+
if bedrooms <= 0 or bedrooms > 20:
|
621 |
analysis_sections['specifications'].append({
|
622 |
+
'check': 'bedrooms',
|
623 |
+
'status': 'fraudulent' if bedrooms <= 0 else 'suspicious',
|
624 |
+
'message': 'Unrealistic number of bedrooms.',
|
625 |
+
'details': f'Bedrooms: {bedrooms}',
|
626 |
+
'severity': 'high' if bedrooms <= 0 else 'medium',
|
627 |
+
'recommendation': 'Provide realistic bedroom count'
|
628 |
})
|
629 |
|
630 |
+
if bathrooms <= 0 or bathrooms > 15:
|
631 |
analysis_sections['specifications'].append({
|
632 |
+
'check': 'bathrooms',
|
633 |
+
'status': 'fraudulent' if bathrooms <= 0 else 'suspicious',
|
634 |
+
'message': 'Unrealistic number of bathrooms.',
|
635 |
+
'details': f'Bathrooms: {bathrooms}',
|
636 |
+
'severity': 'high' if bathrooms <= 0 else 'medium',
|
637 |
+
'recommendation': 'Provide realistic bathroom count'
|
638 |
+
})
|
639 |
+
|
640 |
+
current_year = datetime.now().year
|
641 |
+
if year_built > current_year or year_built < 1800:
|
642 |
+
analysis_sections['specifications'].append({
|
643 |
+
'check': 'year_built',
|
644 |
'status': 'suspicious',
|
645 |
+
'message': 'Unrealistic year built.',
|
646 |
+
'details': f'Year built: {year_built}',
|
647 |
'severity': 'medium',
|
648 |
+
'recommendation': 'Provide realistic year built'
|
649 |
})
|
650 |
+
|
651 |
+
# Pricing validation - Handle flat data structure
|
652 |
+
if market_value <= 0:
|
653 |
+
analysis_sections['pricing'].append({
|
654 |
+
'check': 'market_value',
|
655 |
+
'status': 'missing',
|
656 |
+
'message': 'Market value is required.',
|
657 |
+
'details': 'Please provide the property market value.',
|
658 |
+
'severity': 'high',
|
659 |
+
'recommendation': 'Provide property market value'
|
660 |
+
})
|
661 |
+
elif market_value < 100000: # Minimum reasonable price
|
662 |
+
analysis_sections['pricing'].append({
|
663 |
+
'check': 'market_value',
|
664 |
+
'status': 'fraudulent' if market_value < 10000 else 'suspicious',
|
665 |
+
'message': 'Unusually low market value.',
|
666 |
+
'details': f'Market value: ₹{market_value:,.0f}',
|
667 |
+
'severity': 'high' if market_value < 10000 else 'medium',
|
668 |
+
'recommendation': 'Verify market value is accurate'
|
669 |
+
})
|
670 |
+
|
671 |
+
# Description validation
|
672 |
+
description = data.get('description', '').strip()
|
673 |
if description:
|
674 |
+
# Check for fake description patterns
|
675 |
+
if description.isdigit() or description in ['1', '2', '3', '4', '5']:
|
676 |
+
fake_data_detected = True
|
677 |
+
fake_indicators.append("Description is just a number")
|
678 |
+
analysis_sections['description'].append({
|
679 |
+
'check': 'description',
|
680 |
+
'status': 'fraudulent',
|
681 |
+
'message': 'Description is just a number (highly suspicious).',
|
682 |
+
'details': f'Description: {description}',
|
683 |
'severity': 'high',
|
684 |
+
'recommendation': 'Provide a real property description'
|
685 |
})
|
686 |
+
elif len(description) < 50:
|
687 |
+
analysis_sections['description'].append({
|
688 |
+
'check': 'description',
|
689 |
+
'status': 'insufficient',
|
690 |
+
'message': 'Property description is too short.',
|
691 |
+
'details': f'Description length: {len(description)} characters',
|
692 |
+
'severity': 'medium',
|
693 |
+
'recommendation': 'Provide detailed property description'
|
|
|
694 |
})
|
695 |
+
else:
|
696 |
+
# Create property data dict for description analysis
|
697 |
+
property_data = {
|
698 |
+
'bedrooms': bedrooms,
|
699 |
+
'bathrooms': bathrooms,
|
700 |
+
'property_type': property_type
|
701 |
+
}
|
702 |
+
description_analysis = analyze_property_description(description, property_data)
|
703 |
+
|
704 |
+
for inconsistency in description_analysis['inconsistencies']:
|
705 |
+
analysis_sections['description'].append({
|
706 |
+
'check': f"desc_{inconsistency['type']}",
|
707 |
+
'status': 'inconsistent',
|
708 |
+
'message': inconsistency['message'],
|
709 |
+
'details': f"Stated: {inconsistency.get('stated', 'N/A')}, Mentioned: {inconsistency.get('mentioned', 'N/A')}",
|
710 |
+
'severity': 'low',
|
711 |
+
'recommendation': 'Review and update property description for consistency'
|
712 |
+
})
|
713 |
+
|
714 |
+
for pattern in description_analysis['suspicious_patterns']:
|
715 |
+
analysis_sections['description'].append({
|
716 |
+
'check': 'desc_suspicious_pattern',
|
717 |
+
'status': 'suspicious',
|
718 |
+
'message': pattern['message'],
|
719 |
+
'details': pattern['pattern'],
|
720 |
+
'severity': 'medium',
|
721 |
+
'recommendation': 'Review description for suspicious language'
|
722 |
+
})
|
723 |
+
else:
|
724 |
+
analysis_sections['description'].append({
|
725 |
+
'check': 'description',
|
726 |
+
'status': 'missing',
|
727 |
+
'message': 'Property description is required.',
|
728 |
+
'details': 'Please provide a detailed property description.',
|
729 |
+
'severity': 'high' if fake_data_detected else 'medium',
|
730 |
+
'recommendation': 'Add more detailed property description'
|
731 |
+
})
|
732 |
+
|
733 |
+
# Media analysis - Handle flat data structure
|
734 |
media_analysis = analyze_documents_and_images(data)
|
735 |
|
|
|
736 |
def check_files_exist(files):
|
737 |
+
"""Improved file existence check"""
|
738 |
if not files:
|
739 |
return False
|
740 |
if isinstance(files, str):
|
741 |
files = [files]
|
742 |
+
# Check for actual file content, not just names
|
743 |
+
return any(f and isinstance(f, str) and f.strip() and
|
744 |
+
not f.endswith('×') and
|
745 |
+
(f.endswith('.pdf') or f.endswith('.jpg') or f.endswith('.jpeg') or f.endswith('.png'))
|
746 |
+
for f in files)
|
747 |
+
|
748 |
+
# Document analysis - More lenient
|
749 |
+
documents = data.get('documents', [])
|
750 |
if media_analysis['total_documents'] == 0:
|
|
|
|
|
751 |
if check_files_exist(documents):
|
752 |
# Files exist but couldn't be analyzed
|
753 |
analysis_sections['documents'].append({
|
|
|
755 |
'status': 'error',
|
756 |
'message': 'Could not analyze provided documents.',
|
757 |
'details': 'Please ensure documents are in PDF format and are accessible.',
|
758 |
+
'severity': 'medium',
|
759 |
'recommendation': 'Please check document format and try again.'
|
760 |
})
|
761 |
else:
|
762 |
analysis_sections['documents'].append({
|
763 |
'check': 'documents_validation',
|
764 |
'status': 'missing',
|
765 |
+
'message': 'Property documents are recommended.',
|
766 |
'details': 'Please upload relevant property documents in PDF format.',
|
767 |
+
'severity': 'medium',
|
768 |
'recommendation': 'Upload property documents in PDF format.'
|
769 |
})
|
770 |
else:
|
|
|
775 |
'status': 'error',
|
776 |
'message': f'Error analyzing document: {doc["error"]}',
|
777 |
'details': doc['summary'],
|
778 |
+
'severity': 'medium',
|
779 |
'recommendation': 'Please ensure the document is a valid PDF file.'
|
780 |
})
|
781 |
elif doc['authenticity'] != 'verified':
|
|
|
784 |
'status': 'unverified',
|
785 |
'message': 'Document authenticity could not be verified.',
|
786 |
'details': doc['summary'],
|
787 |
+
'severity': 'low',
|
788 |
'recommendation': 'Please provide clear, legible documents.'
|
789 |
})
|
790 |
|
791 |
+
# Image analysis - More lenient
|
792 |
+
images = data.get('images', [])
|
793 |
if media_analysis['total_images'] == 0:
|
|
|
|
|
794 |
if check_files_exist(images):
|
795 |
# Files exist but couldn't be analyzed
|
796 |
analysis_sections['documents'].append({
|
|
|
798 |
'status': 'error',
|
799 |
'message': 'Could not analyze provided images.',
|
800 |
'details': 'Please ensure images are in JPG or PNG format and are accessible.',
|
801 |
+
'severity': 'medium',
|
802 |
'recommendation': 'Please check image format and try again.'
|
803 |
})
|
804 |
else:
|
805 |
analysis_sections['documents'].append({
|
806 |
'check': 'images_validation',
|
807 |
'status': 'missing',
|
808 |
+
'message': 'Property images are recommended.',
|
809 |
'details': 'Please upload at least one image of the property.',
|
810 |
+
'severity': 'medium',
|
811 |
'recommendation': 'Upload property images in JPG or PNG format.'
|
812 |
})
|
813 |
else:
|
|
|
818 |
'status': 'error',
|
819 |
'message': f'Error analyzing image: {img["error"]}',
|
820 |
'details': img['description'],
|
821 |
+
'severity': 'medium',
|
822 |
'recommendation': 'Please ensure the image is in JPG or PNG format.'
|
823 |
})
|
824 |
elif not img['is_property_image']:
|
|
|
827 |
'status': 'unverified',
|
828 |
'message': 'Image may not be property-related.',
|
829 |
'details': img['description'],
|
830 |
+
'severity': 'low',
|
831 |
'recommendation': 'Please provide clear property images.'
|
832 |
})
|
833 |
|
|
|
836 |
analysis_sections['documents'].append({
|
837 |
'check': 'media_verification_scores',
|
838 |
'status': 'valid',
|
839 |
+
'message': 'Media verification completed.',
|
840 |
+
'details': f'Documents: {media_analysis["total_documents"]}, Images: {media_analysis["total_images"]}',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
841 |
'severity': 'low',
|
842 |
+
'recommendation': 'Media verification successful.'
|
843 |
})
|
844 |
|
845 |
# Generate Summary
|
|
|
858 |
'inconsistent': 0,
|
859 |
'missing': 0,
|
860 |
'error': 0,
|
861 |
+
'unverified': 0,
|
862 |
+
'fraudulent': 0
|
863 |
},
|
864 |
'fraud_risk_level': 'low',
|
865 |
'media_verification': {
|
866 |
'document_score': media_analysis['document_verification_score'],
|
867 |
'image_score': media_analysis['image_verification_score']
|
868 |
+
},
|
869 |
+
'fake_data_detected': fake_data_detected,
|
870 |
+
'fake_indicators': fake_indicators
|
871 |
}
|
872 |
|
873 |
# Calculate statistics
|
|
|
878 |
if check['status'] in summary['status_counts']:
|
879 |
summary['status_counts'][check['status']] += 1
|
880 |
|
881 |
+
# Calculate fraud risk level - Much stricter
|
882 |
high_severity_issues = summary['severity_counts']['high']
|
883 |
+
fraudulent_issues = summary['status_counts']['fraudulent']
|
884 |
+
|
885 |
+
if fake_data_detected or fraudulent_issues > 0 or high_severity_issues > 3:
|
886 |
summary['fraud_risk_level'] = 'high'
|
887 |
+
elif high_severity_issues > 1:
|
888 |
summary['fraud_risk_level'] = 'medium'
|
889 |
+
else:
|
890 |
+
summary['fraud_risk_level'] = 'low'
|
891 |
|
892 |
# Add summary to analysis
|
893 |
analysis_sections['summary'] = [{
|
|
|
899 |
'recommendation': f'Fraud Risk Level: {summary["fraud_risk_level"].upper()}. Review all findings and address high severity issues first.'
|
900 |
}]
|
901 |
|
902 |
+
# Flatten all sections into a single list
|
903 |
+
all_checks = []
|
904 |
for section_name, checks in analysis_sections.items():
|
905 |
for check in checks:
|
906 |
+
check['section'] = section_name
|
907 |
+
all_checks.append(check)
|
908 |
|
909 |
+
return all_checks
|
910 |
|
911 |
except Exception as e:
|
912 |
+
logger.error(f"Error in cross validation: {str(e)}")
|
913 |
return [{
|
914 |
'check': 'cross_validation_error',
|
915 |
'status': 'error',
|
916 |
+
'message': f'Cross validation failed: {str(e)}',
|
917 |
+
'details': 'An error occurred during cross validation.',
|
918 |
+
'severity': 'medium',
|
919 |
'recommendation': 'Please try again or contact support.'
|
920 |
}]
|
models/fraud_classification.py
CHANGED
@@ -1,178 +1,161 @@
|
|
1 |
# models/fraud_classification.py
|
2 |
|
3 |
-
import re
|
4 |
from .model_loader import load_model
|
5 |
from .logging_config import logger
|
|
|
6 |
|
7 |
def classify_fraud(property_details, description):
|
8 |
"""
|
9 |
-
Classify the risk of
|
10 |
-
This function analyzes property details and description to identify potential fraud indicators.
|
11 |
"""
|
12 |
try:
|
13 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
fraud_classification = {
|
15 |
'alert_level': 'minimal',
|
16 |
'alert_score': 0.0,
|
|
|
17 |
'high_risk': [],
|
18 |
'medium_risk': [],
|
19 |
'low_risk': [],
|
20 |
-
'
|
21 |
}
|
22 |
-
|
23 |
-
#
|
24 |
-
if isinstance(property_details, dict):
|
25 |
-
details_str = '\n'.join(f"{k}: {v}" for k, v in property_details.items())
|
26 |
-
else:
|
27 |
-
details_str = str(property_details)
|
28 |
-
text_to_analyze = f"{details_str}\n{description if description else ''}"
|
29 |
-
|
30 |
-
# Define risk categories for zero-shot classification
|
31 |
-
risk_categories = [
|
32 |
-
"fraudulent listing",
|
33 |
-
"misleading information",
|
34 |
-
"fake property",
|
35 |
-
"scam attempt",
|
36 |
-
"legitimate listing"
|
37 |
-
]
|
38 |
-
|
39 |
-
# Perform zero-shot classification with better error handling
|
40 |
-
try:
|
41 |
-
classifier = load_model("zero-shot-classification")
|
42 |
-
if hasattr(classifier, 'task_type') and classifier.task_type == "zero-shot-classification":
|
43 |
-
# Using fallback classifier
|
44 |
-
result = classifier(text_to_analyze, risk_categories)
|
45 |
-
else:
|
46 |
-
# Using actual model
|
47 |
-
result = classifier(text_to_analyze, risk_categories, multi_label=True)
|
48 |
-
except Exception as e:
|
49 |
-
logger.error(f"Model error in fraud classification: {str(e)}")
|
50 |
-
# Use simple keyword-based fallback
|
51 |
-
result = simple_fraud_classification(text_to_analyze, risk_categories)
|
52 |
-
|
53 |
-
# Process classification results
|
54 |
fraud_score = 0.0
|
55 |
if isinstance(result, dict) and 'scores' in result:
|
56 |
for label, score in zip(result.get('labels', []), result.get('scores', [])):
|
57 |
if label != "legitimate listing":
|
58 |
try:
|
59 |
score_val = float(score)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
except Exception:
|
61 |
score_val = 0.0
|
62 |
fraud_score += score_val
|
63 |
fraud_classification['confidence_scores'][label] = score_val
|
64 |
else:
|
65 |
# Handle fallback result
|
66 |
-
fraud_score = 0.
|
67 |
|
68 |
-
# Normalize fraud score to 0-1 range
|
69 |
try:
|
70 |
-
fraud_score = min(1.0, fraud_score / (len(risk_categories) - 1))
|
71 |
except Exception:
|
72 |
fraud_score = 0.0
|
73 |
fraud_classification['alert_score'] = fraud_score
|
74 |
-
|
75 |
-
#
|
76 |
-
|
77 |
-
'
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
r'price.*negotiable|negotiable.*price',
|
84 |
-
r'no.*documents|documents.*not.*required',
|
85 |
-
r'cash.*only|only.*cash',
|
86 |
-
r'off.*market|market.*off',
|
87 |
-
r'under.*table|table.*under'
|
88 |
-
],
|
89 |
-
'medium_risk': [
|
90 |
-
r'unverified|unconfirmed|unchecked',
|
91 |
-
r'partial|incomplete|missing',
|
92 |
-
r'different.*location|location.*different',
|
93 |
-
r'price.*increased|increased.*price',
|
94 |
-
r'no.*photos|photos.*not.*available',
|
95 |
-
r'contact.*email|email.*contact',
|
96 |
-
r'agent.*not.*available|not.*available.*agent',
|
97 |
-
r'property.*not.*viewable|not.*viewable.*property',
|
98 |
-
r'price.*changed|changed.*price',
|
99 |
-
r'details.*updated|updated.*details'
|
100 |
-
],
|
101 |
-
'low_risk': [
|
102 |
-
r'new.*listing|listing.*new',
|
103 |
-
r'recent.*update|update.*recent',
|
104 |
-
r'price.*reduced|reduced.*price',
|
105 |
-
r'contact.*phone|phone.*contact',
|
106 |
-
r'agent.*available|available.*agent',
|
107 |
-
r'property.*viewable|viewable.*property',
|
108 |
-
r'photos.*available|available.*photos',
|
109 |
-
r'documents.*available|available.*documents',
|
110 |
-
r'price.*fixed|fixed.*price',
|
111 |
-
r'details.*complete|complete.*details'
|
112 |
-
]
|
113 |
-
}
|
114 |
-
|
115 |
-
# Check for fraud indicators in text
|
116 |
-
for risk_level, patterns in fraud_indicators.items():
|
117 |
-
for pattern in patterns:
|
118 |
-
try:
|
119 |
-
matches = re.finditer(pattern, text_to_analyze, re.IGNORECASE)
|
120 |
-
for match in matches:
|
121 |
-
indicator = match.group(0)
|
122 |
-
if indicator not in fraud_classification[risk_level]:
|
123 |
-
fraud_classification[risk_level].append(indicator)
|
124 |
-
except Exception as e:
|
125 |
-
logger.warning(f"Regex error in fraud indicator pattern '{pattern}': {str(e)}")
|
126 |
-
|
127 |
-
# Determine alert level based on fraud score and indicators
|
128 |
-
try:
|
129 |
-
if fraud_score > 0.7 or len(fraud_classification['high_risk']) > 0:
|
130 |
-
fraud_classification['alert_level'] = 'critical'
|
131 |
-
elif fraud_score > 0.5 or len(fraud_classification['medium_risk']) > 2:
|
132 |
-
fraud_classification['alert_level'] = 'high'
|
133 |
-
elif fraud_score > 0.3 or len(fraud_classification['medium_risk']) > 0:
|
134 |
-
fraud_classification['alert_level'] = 'medium'
|
135 |
-
elif fraud_score > 0.1 or len(fraud_classification['low_risk']) > 0:
|
136 |
-
fraud_classification['alert_level'] = 'low'
|
137 |
-
else:
|
138 |
-
fraud_classification['alert_level'] = 'minimal'
|
139 |
-
except Exception as e:
|
140 |
-
logger.warning(f"Error determining alert level: {str(e)}")
|
141 |
fraud_classification['alert_level'] = 'minimal'
|
142 |
-
|
143 |
-
#
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
if
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
except Exception as e:
|
164 |
-
logger.warning(f"Error checking price/market_value: {str(e)}")
|
165 |
-
|
166 |
return fraud_classification
|
|
|
167 |
except Exception as e:
|
168 |
logger.error(f"Error in fraud classification: {str(e)}")
|
169 |
return {
|
170 |
-
'alert_level': '
|
171 |
-
'alert_score':
|
172 |
-
'
|
|
|
173 |
'medium_risk': [],
|
174 |
'low_risk': [],
|
175 |
-
'
|
176 |
}
|
177 |
|
178 |
def simple_fraud_classification(text, categories):
|
|
|
1 |
# models/fraud_classification.py
|
2 |
|
|
|
3 |
from .model_loader import load_model
|
4 |
from .logging_config import logger
|
5 |
+
import re
|
6 |
|
7 |
# --- Heuristics for obviously fabricated listing values ----------------------
# Every pattern must match a *whole* number. The trailing look-ahead rejects a
# match that is only the leading digits of a larger value, so "₹2" does not
# fire on "₹2,500,000" and "price: 250" does not fire on "price: 2500000".
_WHOLE_NUMBER = r"(?![\d,.]*\d)"
# A small number followed by a magnitude word ("85 lakh", "2 cr") is not tiny.
_NO_MAGNITUDE = r"(?!\s*(?:k|l|lacs?|lakhs?|cr|crores?|m|mn|million)\b)"
# Separator between a field label and its value: "price: 2", "'price': 2",
# "price = 2", "price is 2". Keeps the number tied to the label instead of
# letting arbitrary text sit between them.
_LABEL_SEPARATOR = r"""['"]?\s*(?:[:=]|is\b)?\s*"""

_FAKE_VALUE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # Price of at most three digits.
        r"\bprice\b" + _LABEL_SEPARATOR + r"(?:₹|rs\.?|inr|\$)?\s*\d{1,3}"
        + _WHOLE_NUMBER + _NO_MAGNITUDE,
        # Size of at most two digits (three-digit areas are ordinary flats).
        # NOTE: unit-blind, so "size: 2 acres" is still flagged.
        r"\bsize\b" + _LABEL_SEPARATOR + r"\d{1,2}" + _WHOLE_NUMBER,
        # Single-digit rupee amounts such as "₹2".
        r"₹[1-5]" + _WHOLE_NUMBER,
        # Single-digit areas such as "2 sq ft" (but not "1202 sq ft").
        r"(?<![\d,.])[1-5] sq ft",
    )
)

# One token per number; thousands/lakh grouping ("5,000,000", "25,00,000") and
# decimals ("2.5") stay in a single token so one price is not counted as
# several repeated values.
_NUMBER_TOKEN = re.compile(r"\d{1,3}(?:,\d{2,3})+(?:\.\d+)?|\d+(?:\.\d+)?")

# Damping applied to each non-legitimate label's score before summing.
_LABEL_WEIGHTS = {
    "suspicious listing": 0.5,
    "potential fraud": 0.7,
    "high risk listing": 0.8,
}


def _looks_fabricated(text, description):
    """Return True when lower-cased listing *text* shows placeholder data."""
    # A description that is nothing but a number (e.g. "2").
    if str(description).strip().isdigit():
        return True

    # Tiny labelled prices / sizes / rupee amounts / areas.
    if any(pattern.search(text) for pattern in _FAKE_VALUE_PATTERNS):
        return True

    # The same one or two values repeated across the listing ("2, 2, 2, 2").
    numbers = _NUMBER_TOKEN.findall(text)
    return len(numbers) >= 3 and len(set(numbers)) <= 2


def classify_fraud(property_details, description):
    """
    Classify the fraud risk of a property listing.

    The listing is first screened with cheap heuristics for fabricated data
    (numeric-only description, tiny price/size values, the same number
    repeated everywhere). Only listings that pass the screen are sent to the
    zero-shot classifier.

    Args:
        property_details: Listing details; formatted with ``str()``, so a
            dict or a pre-formatted string both work.
        description: Free-text listing description.

    Returns:
        dict with keys ``alert_level`` ('minimal' | 'low' | 'medium' |
        'high'), ``alert_score`` (float in 0..1), ``confidence_scores``
        (label -> weighted score), ``high_risk`` / ``medium_risk`` /
        ``low_risk`` (lists of indicator strings) and ``reasoning`` (str).
        Never raises for analysis failures: they are logged and reported as
        a minimal-risk result whose ``reasoning`` carries the error.
    """
    try:
        # Combine property details and description for analysis
        text_to_analyze = f"{property_details} {description}"

        # Screen for obvious fake data before paying for model inference.
        if _looks_fabricated(text_to_analyze.lower(), description):
            return {
                'alert_level': 'high',
                'alert_score': 0.9,  # 90% fraud score for fake data
                'confidence_scores': {
                    'high risk listing': 0.9,
                    'potential fraud': 0.8,
                    'suspicious listing': 0.7,
                    'legitimate listing': 0.1
                },
                'high_risk': ['Fake data patterns detected'],
                'medium_risk': [],
                'low_risk': [],
                'reasoning': 'This property was classified as high risk due to detected fake data patterns (repeated numbers, suspiciously low values, unrealistic specifications).'
            }

        classifier = load_model("zero-shot-classification", "facebook/bart-large-mnli")

        risk_categories = [
            "legitimate listing",
            "suspicious listing",
            "potential fraud",
            "high risk listing"
        ]

        # Only the first 1000 characters are classified.
        result = classifier(text_to_analyze[:1000], risk_categories, multi_label=False)

        fraud_classification = {
            'alert_level': 'minimal',
            'alert_score': 0.0,
            'confidence_scores': {},
            'high_risk': [],
            'medium_risk': [],
            'low_risk': [],
            'reasoning': ''
        }

        # Sum the damped scores of every non-legitimate label.
        fraud_score = 0.0
        if isinstance(result, dict) and 'scores' in result:
            for label, score in zip(result.get('labels', []), result.get('scores', [])):
                if label == "legitimate listing":
                    continue
                try:
                    score_val = float(score) * _LABEL_WEIGHTS.get(label, 1.0)
                except (TypeError, ValueError):
                    # An unparsable score contributes nothing.
                    score_val = 0.0
                fraud_score += score_val
                fraud_classification['confidence_scores'][label] = score_val
        else:
            # Classifier returned an unexpected shape: use a small default.
            fraud_score = 0.05

        # Normalize to 0-1 with lenient scaling.
        # NOTE(review): assuming load_model returns a Hugging Face pipeline,
        # multi_label=False yields scores that sum to 1, so this value tops
        # out near 0.19 and the thresholds below can only produce 'minimal'.
        # Confirm whether the per-category divisor is still intended.
        fraud_score = min(1.0, fraud_score / (len(risk_categories) - 1) * 0.7)
        fraud_classification['alert_score'] = fraud_score

        # Map the score to an alert level and a matching explanation.
        if fraud_score >= 0.7:
            fraud_classification['alert_level'] = 'high'
            verdict = "high risk"
        elif fraud_score >= 0.4:
            fraud_classification['alert_level'] = 'medium'
            verdict = "medium risk"
        elif fraud_score >= 0.2:
            fraud_classification['alert_level'] = 'low'
            verdict = "low risk"
        else:
            fraud_classification['alert_level'] = 'minimal'
            verdict = "legitimate"

        reasoning_parts = [
            f"This property was classified as {verdict} based on AI analysis of the listing details."
        ]

        # Name the strongest risk label when it is reasonably confident.
        if fraud_classification['confidence_scores']:
            highest_risk = max(fraud_classification['confidence_scores'].items(), key=lambda x: x[1])
            if highest_risk[1] > 0.3:
                reasoning_parts.append(f"Primary concern: {highest_risk[0]} (confidence: {highest_risk[1]:.0%})")

        fraud_classification['reasoning'] = " ".join(reasoning_parts)

        return fraud_classification

    except Exception as e:
        # Best-effort boundary: a failed analysis is logged and reported as a
        # minimal-risk result instead of propagating to the caller.
        logger.error(f"Error in fraud classification: {str(e)}")
        return {
            'alert_level': 'minimal',
            'alert_score': 0.05,
            'confidence_scores': {},
            'high_risk': [],
            'medium_risk': [],
            'low_risk': [],
            'reasoning': f'Fraud analysis failed: {str(e)}'
        }
|
160 |
|
161 |
def simple_fraud_classification(text, categories):
|
models/trust_score.py
CHANGED
@@ -2,54 +2,173 @@
|
|
2 |
|
3 |
from .model_loader import load_model
|
4 |
from .logging_config import logger
|
|
|
5 |
|
6 |
def generate_trust_score(text, image_analysis, pdf_analysis):
|
7 |
try:
|
8 |
-
#
|
9 |
-
trust_score =
|
10 |
reasoning_parts = []
|
11 |
|
12 |
# Simple text-based trust indicators
|
13 |
text_lower = str(text).lower()
|
14 |
|
15 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
positive_indicators = [
|
17 |
'verified', 'authentic', 'genuine', 'real', 'legitimate',
|
18 |
-
'complete', 'detailed', 'professional', 'official', 'certified'
|
|
|
|
|
|
|
|
|
19 |
]
|
20 |
|
21 |
-
# Negative indicators
|
22 |
negative_indicators = [
|
23 |
'fake', 'scam', 'fraud', 'suspicious', 'unverified',
|
24 |
-
'incomplete', 'missing', 'unclear', 'doubtful', 'questionable'
|
|
|
|
|
|
|
25 |
]
|
26 |
|
27 |
# Count positive and negative indicators
|
28 |
positive_count = sum(1 for indicator in positive_indicators if indicator in text_lower)
|
29 |
negative_count = sum(1 for indicator in negative_indicators if indicator in text_lower)
|
30 |
|
31 |
-
# Adjust score based on indicators
|
32 |
-
if positive_count > 0:
|
33 |
-
trust_score += min(
|
34 |
reasoning_parts.append(f"Found {positive_count} positive trust indicators")
|
35 |
|
36 |
if negative_count > 0:
|
37 |
-
trust_score -= min(30, negative_count *
|
38 |
reasoning_parts.append(f"Found {negative_count} negative trust indicators")
|
39 |
|
40 |
-
# Image analysis contribution
|
41 |
if image_analysis:
|
42 |
image_count = len(image_analysis) if isinstance(image_analysis, list) else 1
|
43 |
if image_count > 0:
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
# PDF analysis contribution
|
48 |
if pdf_analysis:
|
49 |
pdf_count = len(pdf_analysis) if isinstance(pdf_analysis, list) else 1
|
50 |
if pdf_count > 0:
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
# Ensure score is within bounds
|
55 |
trust_score = max(0, min(100, trust_score))
|
@@ -64,4 +183,4 @@ def generate_trust_score(text, image_analysis, pdf_analysis):
|
|
64 |
|
65 |
except Exception as e:
|
66 |
logger.error(f"Error in trust score generation: {str(e)}")
|
67 |
-
return
|
|
|
2 |
|
3 |
from .model_loader import load_model
|
4 |
from .logging_config import logger
|
5 |
+
import re
|
6 |
|
7 |
def generate_trust_score(text, image_analysis, pdf_analysis):
|
8 |
try:
|
9 |
+
# Start with a much lower base score and be very strict
|
10 |
+
trust_score = 20.0 # Drastically reduced from 60.0
|
11 |
reasoning_parts = []
|
12 |
|
13 |
# Simple text-based trust indicators
|
14 |
text_lower = str(text).lower()
|
15 |
|
16 |
+
# CRITICAL: Check for obvious fake data patterns
|
17 |
+
fake_patterns = [
|
18 |
+
r'\b\d+\s*$', # Numbers at end of lines
|
19 |
+
r'^\d+$', # Only numbers
|
20 |
+
r'\b\d{1,2}\s*$', # Single or double digits
|
21 |
+
r'price.*\d{1,3}', # Very low prices
|
22 |
+
r'size.*\d{1,3}', # Very small sizes
|
23 |
+
r'bedrooms.*\d{1,2}', # Very few bedrooms
|
24 |
+
r'bathrooms.*\d{1,2}', # Very few bathrooms
|
25 |
+
]
|
26 |
+
|
27 |
+
fake_detected = False
|
28 |
+
for pattern in fake_patterns:
|
29 |
+
if re.search(pattern, text_lower):
|
30 |
+
fake_detected = True
|
31 |
+
trust_score -= 30 # Heavy penalty for fake patterns
|
32 |
+
reasoning_parts.append("Detected suspicious number patterns")
|
33 |
+
break
|
34 |
+
|
35 |
+
# Check for repeated numbers (like "2, 2, 2, 2")
|
36 |
+
numbers = re.findall(r'\b\d+\b', text_lower)
|
37 |
+
if len(numbers) >= 3:
|
38 |
+
unique_numbers = set(numbers)
|
39 |
+
if len(unique_numbers) <= 2: # If most numbers are the same
|
40 |
+
fake_detected = True
|
41 |
+
trust_score -= 40 # Very heavy penalty
|
42 |
+
reasoning_parts.append("Detected repeated number patterns (likely fake data)")
|
43 |
+
|
44 |
+
# Check for extremely low values
|
45 |
+
if any(word in text_lower for word in ['₹2', '₹1', '₹3', '₹4', '₹5']):
|
46 |
+
fake_detected = True
|
47 |
+
trust_score -= 50 # Extremely heavy penalty
|
48 |
+
reasoning_parts.append("Detected suspiciously low pricing")
|
49 |
+
|
50 |
+
# Check for very small property sizes
|
51 |
+
if any(word in text_lower for word in ['2 sq ft', '1 sq ft', '3 sq ft', '4 sq ft', '5 sq ft']):
|
52 |
+
fake_detected = True
|
53 |
+
trust_score -= 40
|
54 |
+
reasoning_parts.append("Detected unrealistic property size")
|
55 |
+
|
56 |
+
# Check for generic property names
|
57 |
+
if any(word in text_lower for word in ['2', '1', '3', '4', '5']) and len(text.strip()) < 50:
|
58 |
+
fake_detected = True
|
59 |
+
trust_score -= 30
|
60 |
+
reasoning_parts.append("Detected generic/numeric property name")
|
61 |
+
|
62 |
+
# Positive indicators - Much more strict
|
63 |
positive_indicators = [
|
64 |
'verified', 'authentic', 'genuine', 'real', 'legitimate',
|
65 |
+
'complete', 'detailed', 'professional', 'official', 'certified',
|
66 |
+
'luxurious', 'modern', 'spacious', 'well-maintained', 'prime location',
|
67 |
+
'amenities', 'security', 'parking', 'garden', 'balcony',
|
68 |
+
'renovated', 'furnished', 'semi-furnished', 'ready to move',
|
69 |
+
'clear title', 'no litigation', 'approved', 'registered'
|
70 |
]
|
71 |
|
72 |
+
# Negative indicators - More comprehensive
|
73 |
negative_indicators = [
|
74 |
'fake', 'scam', 'fraud', 'suspicious', 'unverified',
|
75 |
+
'incomplete', 'missing', 'unclear', 'doubtful', 'questionable',
|
76 |
+
'urgent sale', 'quick sale', 'no documents needed', 'cash only',
|
77 |
+
'below market', 'distress sale', 'owner abroad', 'inheritance',
|
78 |
+
'unclear title', 'litigation', 'dispute', 'encroachment'
|
79 |
]
|
80 |
|
81 |
# Count positive and negative indicators
|
82 |
positive_count = sum(1 for indicator in positive_indicators if indicator in text_lower)
|
83 |
negative_count = sum(1 for indicator in negative_indicators if indicator in text_lower)
|
84 |
|
85 |
+
# Adjust score based on indicators - Much stricter
|
86 |
+
if positive_count > 0 and not fake_detected:
|
87 |
+
trust_score += min(15, positive_count * 2) # Reduced from 25 to 15
|
88 |
reasoning_parts.append(f"Found {positive_count} positive trust indicators")
|
89 |
|
90 |
if negative_count > 0:
|
91 |
+
trust_score -= min(30, negative_count * 8) # Increased penalty from 20 to 30
|
92 |
reasoning_parts.append(f"Found {negative_count} negative trust indicators")
|
93 |
|
94 |
+
# Image analysis contribution - Much stricter
|
95 |
if image_analysis:
|
96 |
image_count = len(image_analysis) if isinstance(image_analysis, list) else 1
|
97 |
if image_count > 0:
|
98 |
+
# Check if images are actually property-related
|
99 |
+
property_related_count = sum(1 for img in image_analysis if img.get('is_property_related', False))
|
100 |
+
if property_related_count > 0:
|
101 |
+
trust_score += min(10, property_related_count * 3) # Reduced from 20 to 10
|
102 |
+
reasoning_parts.append(f"Property has {property_related_count} property-related images")
|
103 |
+
else:
|
104 |
+
trust_score -= 20 # Penalty for non-property images
|
105 |
+
reasoning_parts.append("No property-related images detected")
|
106 |
+
|
107 |
+
# Bonus for multiple high-quality images
|
108 |
+
if property_related_count >= 3:
|
109 |
+
trust_score += 5
|
110 |
+
reasoning_parts.append("Multiple property images provided")
|
111 |
|
112 |
+
# PDF analysis contribution - Much stricter
|
113 |
if pdf_analysis:
|
114 |
pdf_count = len(pdf_analysis) if isinstance(pdf_analysis, list) else 1
|
115 |
if pdf_count > 0:
|
116 |
+
# Check if documents are actually property-related
|
117 |
+
property_related_docs = sum(1 for doc in pdf_analysis if doc.get('is_property_related', False))
|
118 |
+
if property_related_docs > 0:
|
119 |
+
trust_score += min(10, property_related_docs * 4) # Reduced from 20 to 10
|
120 |
+
reasoning_parts.append(f"Property has {property_related_docs} property-related documents")
|
121 |
+
else:
|
122 |
+
trust_score -= 15 # Penalty for non-property documents
|
123 |
+
reasoning_parts.append("No property-related documents detected")
|
124 |
+
|
125 |
+
# Bonus for multiple documents
|
126 |
+
if property_related_docs >= 2:
|
127 |
+
trust_score += 3
|
128 |
+
reasoning_parts.append("Multiple supporting documents provided")
|
129 |
+
|
130 |
+
# Text quality assessment - Much stricter
|
131 |
+
if text and len(text) > 200 and not fake_detected:
|
132 |
+
trust_score += 8
|
133 |
+
reasoning_parts.append("Detailed property description provided")
|
134 |
+
elif text and len(text) > 100 and not fake_detected:
|
135 |
+
trust_score += 4
|
136 |
+
reasoning_parts.append("Adequate property description provided")
|
137 |
+
elif len(text) < 50:
|
138 |
+
trust_score -= 20 # Heavy penalty for very short descriptions
|
139 |
+
reasoning_parts.append("Very short property description")
|
140 |
+
|
141 |
+
# Location quality assessment - Much stricter
|
142 |
+
if 'hyderabad' in text_lower or 'mumbai' in text_lower or 'delhi' in text_lower or 'bangalore' in text_lower:
|
143 |
+
if not fake_detected:
|
144 |
+
trust_score += 3
|
145 |
+
reasoning_parts.append("Property in major city")
|
146 |
+
|
147 |
+
# Property type assessment - Much stricter
|
148 |
+
if any(prop_type in text_lower for prop_type in ['apartment', 'flat', 'house', 'villa', 'bungalow']):
|
149 |
+
if not fake_detected:
|
150 |
+
trust_score += 2
|
151 |
+
reasoning_parts.append("Clear property type mentioned")
|
152 |
+
|
153 |
+
# Amenities assessment - Much stricter
|
154 |
+
amenities_count = sum(1 for amenity in ['pool', 'gym', 'garden', 'parking', 'security', 'lift', 'balcony']
|
155 |
+
if amenity in text_lower)
|
156 |
+
if amenities_count > 0 and not fake_detected:
|
157 |
+
trust_score += min(5, amenities_count * 1) # Reduced from 10 to 5
|
158 |
+
reasoning_parts.append(f"Property has {amenities_count} amenities mentioned")
|
159 |
+
|
160 |
+
# CRITICAL: Additional fake data checks
|
161 |
+
# Check if all major fields are just numbers
|
162 |
+
numeric_fields = ['property_name', 'bedrooms', 'bathrooms', 'sq_ft', 'market_value']
|
163 |
+
numeric_count = 0
|
164 |
+
for field in numeric_fields:
|
165 |
+
if field in text_lower and re.search(r'\b\d{1,2}\b', text_lower):
|
166 |
+
numeric_count += 1
|
167 |
+
|
168 |
+
if numeric_count >= 3: # If 3+ fields are just numbers
|
169 |
+
fake_detected = True
|
170 |
+
trust_score -= 60 # Extremely heavy penalty
|
171 |
+
reasoning_parts.append("Multiple fields contain only numbers (highly suspicious)")
|
172 |
|
173 |
# Ensure score is within bounds
|
174 |
trust_score = max(0, min(100, trust_score))
|
|
|
183 |
|
184 |
except Exception as e:
|
185 |
logger.error(f"Error in trust score generation: {str(e)}")
|
186 |
+
return 10.0, f"Trust analysis failed: {str(e)}" # Reduced from 50.0 to 10.0
|