Spaces:

sksameermujahid
/

propertyverification

Running

App Files Files Community

sksameermujahid committed on Jul 23

Commit

0e5c14c

verified ·

1 Parent(s): 6e3dbdb

Upload 22 files

Browse files

Files changed (4) hide show

app.py +85 -45
models/cross_validation.py +333 -153
models/fraud_classification.py +118 -135
models/trust_score.py +136 -17

app.py CHANGED Viewed

@@ -258,20 +258,48 @@ def calculate_final_verdict(results):
         specs_verification = results.get('specs_verification', {})
         quality_assessment = results.get('quality_assessment', {})
-        # Calculate fraud risk score
         fraud_score = 0.0
         fraud_level = fraud_classification.get('alert_level', 'minimal')
         fraud_alert_score = fraud_classification.get('alert_score', 0.0)
         fraud_score_mapping = {
-            'critical': 1.0,
-            'high': 0.8,
-            'medium': 0.6,
-            'low': 0.3,
-            'minimal': 0.1
         }
         fraud_score = fraud_score_mapping.get(fraud_level, 0.1) * fraud_alert_score
         # Calculate trust score
         trust_score = 0.0
         if isinstance(trust_score_data, dict):
@@ -287,6 +315,10 @@ def calculate_final_verdict(results):
         else:
             trust_score = 0.0
         # Calculate address verification score
         address_score = 0.0
         if address_verification and isinstance(address_verification, dict):
@@ -323,31 +355,16 @@ def calculate_final_verdict(results):
             score = quality_assessment.get('score', 0.0)
             quality_score = float(score) / 100.0 if score > 0 else 0.0
-        # Calculate cross validation issues
-        cross_validation_issues = 0
-        high_severity_issues = 0
-        medium_severity_issues = 0
-        if isinstance(cross_validation, list):
-            cross_validation_issues = len(cross_validation)
-            for issue in cross_validation:
-                if isinstance(issue, dict):
-                    severity = issue.get('severity', 'low')
-                    if severity == 'high':
-                        high_severity_issues += 1
-                    elif severity == 'medium':
-                        medium_severity_issues += 1
-        # Weighted scoring system with improved weights
         weights = {
-            'fraud': 0.30,      # Increased weight for fraud detection
-            'trust': 0.25,      # Increased weight for trust score
             'address': 0.15,    # Address verification
             'location': 0.10,   # Location analysis
             'price': 0.10,      # Price analysis
-            'legal': 0.05,      # Legal analysis
-            'specs': 0.03,      # Specs verification
-            'quality': 0.02     # Quality assessment
         }
         # Calculate weighted score
@@ -366,32 +383,41 @@ def calculate_final_verdict(results):
         logger.info(f"Score components: fraud={fraud_score:.3f}, trust={trust_score:.3f}, address={address_score:.3f}, location={location_score:.3f}, price={price_score:.3f}, legal={legal_score:.3f}, specs={specs_score:.3f}, quality={quality_score:.3f}")
         logger.info(f"Weighted score before penalty: {weighted_score:.3f}")
-        # Adjust score based on cross validation issues
         issue_penalty = 0.0
         if high_severity_issues > 0:
-            issue_penalty += high_severity_issues * 0.08  # Reduced from 0.15 to 0.08 (8% penalty per high severity issue)
         if medium_severity_issues > 0:
-            issue_penalty += medium_severity_issues * 0.04  # Reduced from 0.08 to 0.04 (4% penalty per medium severity issue)
         weighted_score = max(0.0, weighted_score - issue_penalty)
         logger.info(f"Issue penalty: {issue_penalty:.3f}, Final weighted score: {weighted_score:.3f}")
-        # Ensure minimum score for any valid data
-        if any([trust_score > 0, address_score > 0, location_score > 0, price_score > 0]):
-            weighted_score = max(0.15, weighted_score)  # Increased minimum from 0.1 to 0.15 (15% minimum)
-        # Determine verdict and risk level with improved logic
-        if weighted_score >= 0.75 and fraud_score < 0.2 and high_severity_issues == 0:
             verdict = 'VERIFIED REAL ESTATE LISTING'
             risk_level = 'low'
-        elif weighted_score >= 0.60 and fraud_score < 0.4 and high_severity_issues <= 1:
             verdict = 'LIKELY LEGITIMATE'
             risk_level = 'low'
-        elif weighted_score >= 0.40 and fraud_score < 0.6 and high_severity_issues <= 2:
             verdict = 'SUSPICIOUS LISTING'
             risk_level = 'medium'
-        elif fraud_score >= 0.6 or weighted_score < 0.20 or high_severity_issues >= 3:
             verdict = 'HIGH RISK LISTING'
             risk_level = 'high'
         elif weighted_score >= 0.20:
@@ -404,10 +430,16 @@ def calculate_final_verdict(results):
         # Generate detailed reasoning
         reasoning_parts = []
         if fraud_score > 0.3:
             reasoning_parts.append(f"Fraud risk detected (level: {fraud_level})")
-        if trust_score < 0.5:
             reasoning_parts.append(f"Low trust score ({trust_score:.1%})")
         if address_score < 0.5:
@@ -439,12 +471,16 @@ def calculate_final_verdict(results):
         # Ensure score is between 0 and 100
         overall_score = max(0, min(100, overall_score))
-        # Ensure minimum score for any valid data
-        if overall_score == 0 and any([trust_score > 0, address_score > 0, location_score > 0]):
             overall_score = 15  # Minimum 15% score if any component is valid
-        # Final score adjustment based on data quality
-        if high_severity_issues >= 3:
             overall_score = max(10, overall_score)  # Minimum 10% for high risk
         elif high_severity_issues >= 1:
             overall_score = max(15, overall_score)  # Minimum 15% for medium risk
@@ -467,8 +503,12 @@ def calculate_final_verdict(results):
                 'specs_score': specs_score,
                 'quality_score': quality_score,
                 'weighted_score': weighted_score,
-                'cross_validation_issues': cross_validation_issues,
-                'high_severity_issues': high_severity_issues
             }
         }

         specs_verification = results.get('specs_verification', {})
         quality_assessment = results.get('quality_assessment', {})
+        # CRITICAL: Check for fake data patterns in cross validation
+        fake_data_detected = False
+        fraudulent_issues = 0
+        high_severity_issues = 0
+        medium_severity_issues = 0
+        low_severity_issues = 0
+        if isinstance(cross_validation, list):
+            for issue in cross_validation:
+                if isinstance(issue, dict):
+                    status = issue.get('status', '')
+                    severity = issue.get('severity', 'low')
+                    if status == 'fraudulent':
+                        fraudulent_issues += 1
+                        fake_data_detected = True
+                    elif severity == 'high':
+                        high_severity_issues += 1
+                    elif severity == 'medium':
+                        medium_severity_issues += 1
+                    elif severity == 'low':
+                        low_severity_issues += 1
+        # Calculate fraud risk score - Much stricter
         fraud_score = 0.0
         fraud_level = fraud_classification.get('alert_level', 'minimal')
         fraud_alert_score = fraud_classification.get('alert_score', 0.0)
         fraud_score_mapping = {
+            'critical': 1.0,  # Increased back to full penalty
+            'high': 0.8,      # Increased back to full penalty
+            'medium': 0.6,    # Increased back to full penalty
+            'low': 0.4,       # Increased penalty
+            'minimal': 0.1    # Increased penalty
         }
         fraud_score = fraud_score_mapping.get(fraud_level, 0.1) * fraud_alert_score
+        # CRITICAL: Heavy penalty for fake data
+        if fake_data_detected:
+            fraud_score = max(fraud_score, 0.8)  # Minimum 80% fraud score for fake data
+            fraud_level = 'high'
         # Calculate trust score
         trust_score = 0.0
         if isinstance(trust_score_data, dict):
         else:
             trust_score = 0.0
+        # CRITICAL: Heavy penalty for fake data in trust score
+        if fake_data_detected:
+            trust_score = max(0.0, trust_score - 0.5)  # Reduce trust score by 50% for fake data
         # Calculate address verification score
         address_score = 0.0
         if address_verification and isinstance(address_verification, dict):
             score = quality_assessment.get('score', 0.0)
             quality_score = float(score) / 100.0 if score > 0 else 0.0
+        # Much stricter weighted scoring system
         weights = {
+            'fraud': 0.35,      # Increased weight for fraud detection
+            'trust': 0.25,      # Keep trust score important
             'address': 0.15,    # Address verification
             'location': 0.10,   # Location analysis
             'price': 0.10,      # Price analysis
+            'legal': 0.03,      # Legal analysis
+            'specs': 0.01,      # Specs verification
+            'quality': 0.01     # Quality assessment
         }
         # Calculate weighted score
         logger.info(f"Score components: fraud={fraud_score:.3f}, trust={trust_score:.3f}, address={address_score:.3f}, location={location_score:.3f}, price={price_score:.3f}, legal={legal_score:.3f}, specs={specs_score:.3f}, quality={quality_score:.3f}")
         logger.info(f"Weighted score before penalty: {weighted_score:.3f}")
+        # Much stricter penalty system
         issue_penalty = 0.0
+        if fraudulent_issues > 0:
+            issue_penalty += fraudulent_issues * 0.15  # 15% penalty per fraudulent issue
         if high_severity_issues > 0:
+            issue_penalty += high_severity_issues * 0.10  # 10% penalty per high severity issue
         if medium_severity_issues > 0:
+            issue_penalty += medium_severity_issues * 0.05  # 5% penalty per medium severity issue
+        if low_severity_issues > 0:
+            issue_penalty += low_severity_issues * 0.02  # 2% penalty per low severity issue
         weighted_score = max(0.0, weighted_score - issue_penalty)
         logger.info(f"Issue penalty: {issue_penalty:.3f}, Final weighted score: {weighted_score:.3f}")
+        # CRITICAL: Much stricter minimum score requirements
+        if fake_data_detected:
+            weighted_score = max(0.05, weighted_score)  # Maximum 5% score for fake data
+        elif any([trust_score > 0, address_score > 0, location_score > 0, price_score > 0]):
+            weighted_score = max(0.15, weighted_score)  # Minimum 15% for any valid data
+        # Much stricter verdict determination
+        if fake_data_detected or fraudulent_issues > 0:
+            verdict = 'HIGH RISK LISTING'
+            risk_level = 'high'
+        elif weighted_score >= 0.75 and fraud_score < 0.2 and high_severity_issues == 0:
             verdict = 'VERIFIED REAL ESTATE LISTING'
             risk_level = 'low'
+        elif weighted_score >= 0.60 and fraud_score < 0.3 and high_severity_issues <= 1:
             verdict = 'LIKELY LEGITIMATE'
             risk_level = 'low'
+        elif weighted_score >= 0.40 and fraud_score < 0.5 and high_severity_issues <= 2:
             verdict = 'SUSPICIOUS LISTING'
             risk_level = 'medium'
+        elif fraud_score >= 0.5 or weighted_score < 0.20 or high_severity_issues >= 3:
             verdict = 'HIGH RISK LISTING'
             risk_level = 'high'
         elif weighted_score >= 0.20:
         # Generate detailed reasoning
         reasoning_parts = []
+        if fake_data_detected:
+            reasoning_parts.append("Fake data patterns detected")
+        if fraudulent_issues > 0:
+            reasoning_parts.append(f"{fraudulent_issues} fraudulent validation issues")
         if fraud_score > 0.3:
             reasoning_parts.append(f"Fraud risk detected (level: {fraud_level})")
+        if trust_score < 0.3:
             reasoning_parts.append(f"Low trust score ({trust_score:.1%})")
         if address_score < 0.5:
         # Ensure score is between 0 and 100
         overall_score = max(0, min(100, overall_score))
+        # CRITICAL: Much stricter minimum score for fake data
+        if fake_data_detected:
+            overall_score = max(5, min(15, overall_score))  # 5-15% range for fake data
+        elif overall_score == 0 and any([trust_score > 0, address_score > 0, location_score > 0]):
             overall_score = 15  # Minimum 15% score if any component is valid
+        # Final score adjustment based on data quality - Much stricter
+        if fake_data_detected or fraudulent_issues > 0:
+            overall_score = max(5, min(15, overall_score))  # 5-15% for fake/fraudulent data
+        elif high_severity_issues >= 3:
             overall_score = max(10, overall_score)  # Minimum 10% for high risk
         elif high_severity_issues >= 1:
             overall_score = max(15, overall_score)  # Minimum 15% for medium risk
                 'specs_score': specs_score,
                 'quality_score': quality_score,
                 'weighted_score': weighted_score,
+                'cross_validation_issues': len(cross_validation) if isinstance(cross_validation, list) else 0,
+                'high_severity_issues': high_severity_issues,
+                'medium_severity_issues': medium_severity_issues,
+                'low_severity_issues': low_severity_issues,
+                'fraudulent_issues': fraudulent_issues,
+                'fake_data_detected': fake_data_detected
             }
         }

models/cross_validation.py CHANGED Viewed

@@ -69,11 +69,11 @@ def analyze_property_description(description: str, property_data: Dict[str, Any]
         'suspicious_patterns': []
     }
-    # Check room number consistency
     if 'bedroom' in analysis['room_mentions']:
         stated_bedrooms = safe_int_convert(property_data.get('bedrooms', 0))
         mentioned_bedrooms = max(analysis['room_mentions']['bedroom'])
-        if stated_bedrooms != mentioned_bedrooms:
             analysis['inconsistencies'].append({
                 'type': 'bedroom_count',
                 'stated': stated_bedrooms,
@@ -84,7 +84,7 @@ def analyze_property_description(description: str, property_data: Dict[str, Any]
     if 'bathroom' in analysis['room_mentions']:
         stated_bathrooms = safe_float_convert(property_data.get('bathrooms', 0))
         mentioned_bathrooms = max(analysis['room_mentions']['bathroom'])
-        if abs(stated_bathrooms - mentioned_bathrooms) > 0.5:  # Allow for half bathrooms
             analysis['inconsistencies'].append({
                 'type': 'bathroom_count',
                 'stated': stated_bathrooms,
@@ -92,30 +92,47 @@ def analyze_property_description(description: str, property_data: Dict[str, Any]
                 'message': f'Description mentions {mentioned_bathrooms} bathrooms but listing states {stated_bathrooms} bathrooms.'
             })
-    # Check property type consistency
     property_type = property_data.get('property_type', '').lower()
-    if property_type and property_type not in description.lower():
-        analysis['inconsistencies'].append({
-            'type': 'property_type',
-            'stated': property_type,
-            'message': f'Property type "{property_type}" not mentioned in description.'
-        })
-    # Check for suspicious patterns
-    suspicious_patterns = [
-        (r'too good to be true', 'Unrealistic claims'),
-        (r'guaranteed.*return', 'Suspicious return promises'),
-        (r'no.*verification', 'Avoiding verification'),
-        (r'urgent.*sale', 'Pressure tactics'),
-        (r'below.*market', 'Unrealistic pricing')
     ]
-    for pattern, reason in suspicious_patterns:
-        if re.search(pattern, description.lower()):
             analysis['suspicious_patterns'].append({
-                'pattern': pattern,
-                'reason': reason,
-                'message': f'Suspicious pattern detected: {reason}'
             })
     return analysis
@@ -425,149 +442,312 @@ def analyze_documents_and_images(data: Dict[str, Any]) -> Dict[str, Any]:
     return analysis
 def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
-    """Perform comprehensive cross-validation of property data."""
-    cross_checks = []
-    classifier = None
     try:
-        # Load the tiny model for classification with fallback
-        try:
-            classifier = load_model("zero-shot-classification")
-        except Exception as e:
-            logger.warning(f"Could not load classifier for cross validation: {str(e)}")
-            classifier = None
-        # Initialize analysis sections
         analysis_sections = {
             'basic_info': [],
             'location': [],
             'specifications': [],
             'documents': [],
-            'fraud_indicators': []
         }
-        # Process and validate data
-        processed_data = {}
-        # Basic Information Validation
-        property_name = str(data.get('property_name', '')).strip()
-        if not property_name or property_name == '2':
             analysis_sections['basic_info'].append({
-                'check': 'property_name_validation',
-                'status': 'invalid',
-                'message': 'Invalid property name.',
-                'details': 'Please provide a descriptive name for the property.',
                 'severity': 'high',
-                'recommendation': 'Add a proper name for the property.'
             })
-        property_type = str(data.get('property_type', '')).strip()
         if not property_type:
             analysis_sections['basic_info'].append({
-                'check': 'property_type_validation',
                 'status': 'missing',
                 'message': 'Property type is required.',
-                'details': 'Please specify the type of property.',
-                'severity': 'high',
-                'recommendation': 'Select a property type.'
             })
-        status = str(data.get('status', '')).strip()
         if not status:
             analysis_sections['basic_info'].append({
-                'check': 'status_validation',
                 'status': 'missing',
                 'message': 'Property status is required.',
-                'details': 'Please specify if the property is for sale or rent.',
                 'severity': 'high',
-                'recommendation': 'Select the property status.'
             })
-        # Market Value Analysis
-        market_value = safe_float_convert(data.get('market_value', 0))
-        if market_value <= 0:
-            analysis_sections['basic_info'].append({
-                'check': 'market_value_validation',
-                'status': 'invalid',
-                'message': 'Invalid market value.',
-                'details': 'The market value must be a realistic amount.',
                 'severity': 'high',
-                'recommendation': 'Please provide a valid market value.'
             })
-        # Location Analysis
-        location_analysis = analyze_location_consistency(data)
-        for inconsistency in location_analysis['inconsistencies']:
             analysis_sections['location'].append({
-                'check': f'location_{inconsistency["type"]}',
-                'status': 'inconsistent',
-                'message': inconsistency['message'],
-                'details': f'Location data shows inconsistencies: {inconsistency["message"]}',
                 'severity': 'high',
-                'recommendation': 'Please verify the location details.'
             })
-        # Property Specifications Analysis
-        specs_analysis = analyze_property_specifications(data)
-        for inconsistency in specs_analysis['inconsistencies']:
             analysis_sections['specifications'].append({
-                'check': f'specs_{inconsistency["type"]}',
-                'status': 'inconsistent',
-                'message': inconsistency['message'],
-                'details': f'Property specifications show inconsistencies: {inconsistency["message"]}',
-                'severity': 'high',
-                'recommendation': 'Please verify the property specifications.'
             })
-        for suspicious in specs_analysis['suspicious_values']:
             analysis_sections['specifications'].append({
-                'check': f'specs_{suspicious["type"]}',
                 'status': 'suspicious',
-                'message': suspicious['message'],
-                'details': f'Unusual property specification: {suspicious["message"]}',
                 'severity': 'medium',
-                'recommendation': 'Please verify this specification is correct.'
             })
-        # Description Analysis
-        description = str(data.get('description', '')).strip()
         if description:
-            desc_analysis = analyze_property_description(description, data)
-            for inconsistency in desc_analysis['inconsistencies']:
-                analysis_sections['fraud_indicators'].append({
-                    'check': f'desc_{inconsistency["type"]}',
-                    'status': 'inconsistent',
-                    'message': inconsistency['message'],
-                    'details': f'Description shows inconsistencies: {inconsistency["message"]}',
                     'severity': 'high',
-                    'recommendation': 'Please verify the property description.'
                 })
-            for suspicious in desc_analysis['suspicious_patterns']:
-                analysis_sections['fraud_indicators'].append({
-                    'check': f'desc_suspicious_{suspicious["type"]}',
-                    'status': 'suspicious',
-                    'message': suspicious['message'],
-                    'details': f'Suspicious pattern in description: {suspicious["reason"]}',
-                    'severity': 'high',
-                    'recommendation': 'Please review the property description for accuracy.'
                 })
-        # Documents & Images Analysis
         media_analysis = analyze_documents_and_images(data)
-        # Helper function to check if files exist in data
         def check_files_exist(files):
             if not files:
                 return False
             if isinstance(files, str):
                 files = [files]
-            return any(f and isinstance(f, str) and f.strip() and not f.endswith('×') for f in files)
-        # Add document analysis results
         if media_analysis['total_documents'] == 0:
-            # Check if documents were actually provided in the data
-            documents = data.get('documents', [])
             if check_files_exist(documents):
                 # Files exist but couldn't be analyzed
                 analysis_sections['documents'].append({
@@ -575,16 +755,16 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
                     'status': 'error',
                     'message': 'Could not analyze provided documents.',
                     'details': 'Please ensure documents are in PDF format and are accessible.',
-                    'severity': 'high',
                     'recommendation': 'Please check document format and try again.'
                 })
             else:
                 analysis_sections['documents'].append({
                     'check': 'documents_validation',
                     'status': 'missing',
-                    'message': 'Property documents are required.',
                     'details': 'Please upload relevant property documents in PDF format.',
-                    'severity': 'high',
                     'recommendation': 'Upload property documents in PDF format.'
                 })
         else:
@@ -595,7 +775,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
                         'status': 'error',
                         'message': f'Error analyzing document: {doc["error"]}',
                         'details': doc['summary'],
-                        'severity': 'high',
                         'recommendation': 'Please ensure the document is a valid PDF file.'
                     })
                 elif doc['authenticity'] != 'verified':
@@ -604,14 +784,13 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
                         'status': 'unverified',
                         'message': 'Document authenticity could not be verified.',
                         'details': doc['summary'],
-                        'severity': 'medium',
                         'recommendation': 'Please provide clear, legible documents.'
                     })
-        # Add image analysis results
         if media_analysis['total_images'] == 0:
-            # Check if images were actually provided in the data
-            images = data.get('images', [])
             if check_files_exist(images):
                 # Files exist but couldn't be analyzed
                 analysis_sections['documents'].append({
@@ -619,16 +798,16 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
                     'status': 'error',
                     'message': 'Could not analyze provided images.',
                     'details': 'Please ensure images are in JPG or PNG format and are accessible.',
-                    'severity': 'high',
                     'recommendation': 'Please check image format and try again.'
                 })
             else:
                 analysis_sections['documents'].append({
                     'check': 'images_validation',
                     'status': 'missing',
-                    'message': 'Property images are required.',
                     'details': 'Please upload at least one image of the property.',
-                    'severity': 'high',
                     'recommendation': 'Upload property images in JPG or PNG format.'
                 })
         else:
@@ -639,7 +818,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
                         'status': 'error',
                         'message': f'Error analyzing image: {img["error"]}',
                         'details': img['description'],
-                        'severity': 'high',
                         'recommendation': 'Please ensure the image is in JPG or PNG format.'
                     })
                 elif not img['is_property_image']:
@@ -648,7 +827,7 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
                         'status': 'unverified',
                         'message': 'Image may not be property-related.',
                         'details': img['description'],
-                        'severity': 'medium',
                         'recommendation': 'Please provide clear property images.'
                     })
@@ -657,17 +836,10 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
             analysis_sections['documents'].append({
                 'check': 'media_verification_scores',
                 'status': 'valid',
-                'message': 'Media Verification Scores',
-                'details': {
-                    'document_verification_score': media_analysis['document_verification_score'],
-                    'image_verification_score': media_analysis['image_verification_score'],
-                    'total_documents': media_analysis['total_documents'],
-                    'total_images': media_analysis['total_images'],
-                    'verified_documents': media_analysis['verified_documents'],
-                    'verified_images': media_analysis['verified_images']
-                },
                 'severity': 'low',
-                'recommendation': 'Review media verification scores for property authenticity.'
             })
         # Generate Summary
@@ -686,13 +858,16 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
                 'inconsistent': 0,
                 'missing': 0,
                 'error': 0,
-                'unverified': 0
             },
             'fraud_risk_level': 'low',
             'media_verification': {
                 'document_score': media_analysis['document_verification_score'],
                 'image_score': media_analysis['image_verification_score']
-            }
         }
         # Calculate statistics
@@ -703,12 +878,16 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
                 if check['status'] in summary['status_counts']:
                     summary['status_counts'][check['status']] += 1
-        # Calculate fraud risk level
         high_severity_issues = summary['severity_counts']['high']
-        if high_severity_issues > 5:
             summary['fraud_risk_level'] = 'high'
-        elif high_severity_issues > 2:
             summary['fraud_risk_level'] = 'medium'
         # Add summary to analysis
         analysis_sections['summary'] = [{
@@ -720,21 +899,22 @@ def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
             'recommendation': f'Fraud Risk Level: {summary["fraud_risk_level"].upper()}. Review all findings and address high severity issues first.'
         }]
-        # Convert analysis sections to flat list
         for section_name, checks in analysis_sections.items():
             for check in checks:
-                check['category'] = section_name
-                cross_checks.append(check)
-        return cross_checks
     except Exception as e:
-        logger.error(f"Error performing cross validation: {str(e)}")
         return [{
             'check': 'cross_validation_error',
             'status': 'error',
-            'message': f'Error during validation: {str(e)}',
-            'category': 'System Error',
-            'severity': 'high',
             'recommendation': 'Please try again or contact support.'
         }]

         'suspicious_patterns': []
     }
+    # Check room number consistency - More lenient matching
     if 'bedroom' in analysis['room_mentions']:
         stated_bedrooms = safe_int_convert(property_data.get('bedrooms', 0))
         mentioned_bedrooms = max(analysis['room_mentions']['bedroom'])
+        if stated_bedrooms != mentioned_bedrooms and abs(stated_bedrooms - mentioned_bedrooms) > 1:
             analysis['inconsistencies'].append({
                 'type': 'bedroom_count',
                 'stated': stated_bedrooms,
     if 'bathroom' in analysis['room_mentions']:
         stated_bathrooms = safe_float_convert(property_data.get('bathrooms', 0))
         mentioned_bathrooms = max(analysis['room_mentions']['bathroom'])
+        if abs(stated_bathrooms - mentioned_bathrooms) > 1.0:  # More lenient for bathrooms
             analysis['inconsistencies'].append({
                 'type': 'bathroom_count',
                 'stated': stated_bathrooms,
                 'message': f'Description mentions {mentioned_bathrooms} bathrooms but listing states {stated_bathrooms} bathrooms.'
             })
+    # Check property type consistency - More flexible matching
     property_type = property_data.get('property_type', '').lower()
+    if property_type:
+        # Create flexible property type patterns
+        property_type_patterns = {
+            'apartment': ['apartment', 'flat', 'unit', 'condo'],
+            'house': ['house', 'home', 'villa', 'bungalow', 'townhouse'],
+            'plot': ['plot', 'land', 'site'],
+            'commercial': ['commercial', 'office', 'shop', 'retail']
+        }
+        # Check if property type is mentioned in description
+        description_lower = description.lower()
+        type_found = False
+        for category, patterns in property_type_patterns.items():
+            if property_type in category or any(pattern in property_type for pattern in patterns):
+                if any(pattern in description_lower for pattern in patterns):
+                    type_found = True
+                    break
+        # Only flag if property type is completely missing and description is substantial
+        if not type_found and len(description) > 100:
+            analysis['inconsistencies'].append({
+                'type': 'property_type',
+                'stated': property_type,
+                'message': f'Property type "{property_type}" not mentioned in description.'
+            })
+    # Check for suspicious patterns - More lenient
+    suspicious_keywords = [
+        'urgent sale', 'quick sale', 'no documents needed', 'cash only',
+        'below market', 'distress sale', 'owner abroad', 'inheritance'
     ]
+    description_lower = description.lower()
+    for keyword in suspicious_keywords:
+        if keyword in description_lower:
             analysis['suspicious_patterns'].append({
+                'pattern': keyword,
+                'message': f'Description contains potentially suspicious phrase: "{keyword}"'
             })
     return analysis
     return analysis
 def perform_cross_validation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Perform comprehensive cross-validation of property data.
+    """
     try:
         analysis_sections = {
             'basic_info': [],
             'location': [],
             'specifications': [],
             'documents': [],
+            'images': [],
+            'pricing': [],
+            'description': []
         }
+        # CRITICAL: Check for obvious fake data patterns first
+        fake_data_detected = False
+        fake_indicators = []
+        # Check for numeric-only property names
+        property_name = data.get('property_name', '').strip()
+        if property_name.isdigit() or property_name in ['1', '2', '3', '4', '5']:
+            fake_data_detected = True
+            fake_indicators.append("Property name is just a number")
             analysis_sections['basic_info'].append({
+                'check': 'property_name',
+                'status': 'fraudulent',
+                'message': 'Property name is just a number (highly suspicious).',
+                'details': f'Property name: {property_name}',
                 'severity': 'high',
+                'recommendation': 'Provide a real property name'
+            })
+        # Check for suspiciously low values
+        market_value = safe_float_convert(data.get('market_value', 0))
+        if market_value <= 10:  # Extremely low threshold
+            fake_data_detected = True
+            fake_indicators.append("Suspiciously low market value")
+            analysis_sections['pricing'].append({
+                'check': 'market_value',
+                'status': 'fraudulent',
+                'message': 'Market value is suspiciously low.',
+                'details': f'Market value: ₹{market_value:,.0f}',
+                'severity': 'high',
+                'recommendation': 'Provide realistic market value'
+            })
+        # Check for unrealistic property sizes
+        square_feet = safe_float_convert(data.get('sq_ft', 0))
+        if square_feet <= 10:  # Extremely small
+            fake_data_detected = True
+            fake_indicators.append("Unrealistic property size")
+            analysis_sections['specifications'].append({
+                'check': 'square_feet',
+                'status': 'fraudulent',
+                'message': 'Property size is unrealistically small.',
+                'details': f'Square feet: {square_feet}',
+                'severity': 'high',
+                'recommendation': 'Provide realistic property size'
+            })
+        # Check for repeated suspicious numbers
+        all_values = [
+            str(data.get('bedrooms', '')),
+            str(data.get('bathrooms', '')),
+            str(data.get('total_rooms', '')),
+            str(data.get('parking', '')),
+            str(data.get('year_built', '')),
+            str(data.get('market_value', '')),
+            str(data.get('sq_ft', ''))
+        ]
+        numeric_values = [v for v in all_values if v.isdigit()]
+        if len(numeric_values) >= 3:
+            unique_values = set(numeric_values)
+            if len(unique_values) <= 2:  # Most values are the same
+                fake_data_detected = True
+                fake_indicators.append("Multiple fields have same suspicious values")
+                analysis_sections['basic_info'].append({
+                    'check': 'repeated_values',
+                    'status': 'fraudulent',
+                    'message': 'Multiple fields contain the same suspicious values.',
+                    'details': f'Repeated values: {unique_values}',
+                    'severity': 'high',
+                    'recommendation': 'Provide realistic and varied property details'
+                })
+        # Basic information validation - Handle flat data structure
+        if not property_name or len(property_name) < 3:
+            analysis_sections['basic_info'].append({
+                'check': 'property_name',
+                'status': 'missing',
+                'message': 'Property name is required.',
+                'details': 'Please provide a valid property name.',
+                'severity': 'high' if fake_data_detected else 'medium',
+                'recommendation': 'Provide a valid property name (not just numbers)'
             })
+        # Property type validation
+        property_type = data.get('property_type', '').strip()
         if not property_type:
             analysis_sections['basic_info'].append({
+                'check': 'property_type',
                 'status': 'missing',
                 'message': 'Property type is required.',
+                'details': 'Please specify the property type.',
+                'severity': 'high' if fake_data_detected else 'medium',
+                'recommendation': 'Specify property type (apartment, house, etc.)'
             })
+        # Status validation
+        status = data.get('status', '').strip()
         if not status:
             analysis_sections['basic_info'].append({
+                'check': 'status',
                 'status': 'missing',
                 'message': 'Property status is required.',
+                'details': 'Please specify if property is for sale or rent.',
+                'severity': 'high' if fake_data_detected else 'medium',
+                'recommendation': 'Specify property status (for sale, for rent, etc.)'
+            })
+        # Location validation - Handle flat data structure
+        address = data.get('address', '').strip()
+        city = data.get('city', '').strip()
+        state = data.get('state', '').strip()
+        postal_code = data.get('postal_code', '').strip()
+        if not address:
+            analysis_sections['location'].append({
+                'check': 'address',
+                'status': 'missing',
+                'message': 'Property address is required.',
+                'details': 'Please provide the complete property address.',
                 'severity': 'high',
+                'recommendation': 'Provide complete property address'
             })
+        if not city:
+            analysis_sections['location'].append({
+                'check': 'city',
+                'status': 'missing',
+                'message': 'City is required.',
+                'details': 'Please specify the city.',
                 'severity': 'high',
+                'recommendation': 'Specify the city'
             })
+        if not state:
             analysis_sections['location'].append({
+                'check': 'state',
+                'status': 'missing',
+                'message': 'State is required.',
+                'details': 'Please specify the state.',
                 'severity': 'high',
+                'recommendation': 'Specify the state'
             })
+        # Postal code validation - more lenient
+        if postal_code:
+            if not postal_code.isdigit() or len(postal_code) < 5:
+                analysis_sections['location'].append({
+                    'check': 'postal_code',
+                    'status': 'invalid',
+                    'message': 'Invalid postal code format.',
+                    'details': f'Postal code: {postal_code}',
+                    'severity': 'low',
+                    'recommendation': 'Provide a valid postal code'
+                })
+        # Specifications validation - Handle flat data structure
+        bedrooms = safe_int_convert(data.get('bedrooms', 0))
+        bathrooms = safe_float_convert(data.get('bathrooms', 0))
+        year_built = safe_int_convert(data.get('year_built', 0))
+        # Much stricter validation ranges
+        if bedrooms <= 0 or bedrooms > 20:
             analysis_sections['specifications'].append({
+                'check': 'bedrooms',
+                'status': 'fraudulent' if bedrooms <= 0 else 'suspicious',
+                'message': 'Unrealistic number of bedrooms.',
+                'details': f'Bedrooms: {bedrooms}',
+                'severity': 'high' if bedrooms <= 0 else 'medium',
+                'recommendation': 'Provide realistic bedroom count'
             })
+        if bathrooms <= 0 or bathrooms > 15:
             analysis_sections['specifications'].append({
+                'check': 'bathrooms',
+                'status': 'fraudulent' if bathrooms <= 0 else 'suspicious',
+                'message': 'Unrealistic number of bathrooms.',
+                'details': f'Bathrooms: {bathrooms}',
+                'severity': 'high' if bathrooms <= 0 else 'medium',
+                'recommendation': 'Provide realistic bathroom count'
+            })
+        current_year = datetime.now().year
+        if year_built > current_year or year_built < 1800:
+            analysis_sections['specifications'].append({
+                'check': 'year_built',
                 'status': 'suspicious',
+                'message': 'Unrealistic year built.',
+                'details': f'Year built: {year_built}',
                 'severity': 'medium',
+                'recommendation': 'Provide realistic year built'
             })
+        # Pricing validation - Handle flat data structure
+        if market_value <= 0:
+            analysis_sections['pricing'].append({
+                'check': 'market_value',
+                'status': 'missing',
+                'message': 'Market value is required.',
+                'details': 'Please provide the property market value.',
+                'severity': 'high',
+                'recommendation': 'Provide property market value'
+            })
+        elif market_value < 100000:  # Minimum reasonable price
+            analysis_sections['pricing'].append({
+                'check': 'market_value',
+                'status': 'fraudulent' if market_value < 10000 else 'suspicious',
+                'message': 'Unusually low market value.',
+                'details': f'Market value: ₹{market_value:,.0f}',
+                'severity': 'high' if market_value < 10000 else 'medium',
+                'recommendation': 'Verify market value is accurate'
+            })
+        # Description validation
+        description = data.get('description', '').strip()
         if description:
+            # Check for fake description patterns
+            if description.isdigit() or description in ['1', '2', '3', '4', '5']:
+                fake_data_detected = True
+                fake_indicators.append("Description is just a number")
+                analysis_sections['description'].append({
+                    'check': 'description',
+                    'status': 'fraudulent',
+                    'message': 'Description is just a number (highly suspicious).',
+                    'details': f'Description: {description}',
                     'severity': 'high',
+                    'recommendation': 'Provide a real property description'
                 })
+            elif len(description) < 50:
+                analysis_sections['description'].append({
+                    'check': 'description',
+                    'status': 'insufficient',
+                    'message': 'Property description is too short.',
+                    'details': f'Description length: {len(description)} characters',
+                    'severity': 'medium',
+                    'recommendation': 'Provide detailed property description'
                 })
+            else:
+                # Create property data dict for description analysis
+                property_data = {
+                    'bedrooms': bedrooms,
+                    'bathrooms': bathrooms,
+                    'property_type': property_type
+                }
+                description_analysis = analyze_property_description(description, property_data)
+                for inconsistency in description_analysis['inconsistencies']:
+                    analysis_sections['description'].append({
+                        'check': f"desc_{inconsistency['type']}",
+                        'status': 'inconsistent',
+                        'message': inconsistency['message'],
+                        'details': f"Stated: {inconsistency.get('stated', 'N/A')}, Mentioned: {inconsistency.get('mentioned', 'N/A')}",
+                        'severity': 'low',
+                        'recommendation': 'Review and update property description for consistency'
+                    })
+                for pattern in description_analysis['suspicious_patterns']:
+                    analysis_sections['description'].append({
+                        'check': 'desc_suspicious_pattern',
+                        'status': 'suspicious',
+                        'message': pattern['message'],
+                        'details': pattern['pattern'],
+                        'severity': 'medium',
+                        'recommendation': 'Review description for suspicious language'
+                    })
+        else:
+            analysis_sections['description'].append({
+                'check': 'description',
+                'status': 'missing',
+                'message': 'Property description is required.',
+                'details': 'Please provide a detailed property description.',
+                'severity': 'high' if fake_data_detected else 'medium',
+                'recommendation': 'Add more detailed property description'
+            })
+        # Media analysis - Handle flat data structure
         media_analysis = analyze_documents_and_images(data)
         def check_files_exist(files):
+            """Improved file existence check"""
             if not files:
                 return False
             if isinstance(files, str):
                 files = [files]
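+            # Accept only non-empty string entries that do not end with '×' and whose name ends in a supported extension (.pdf, .jpg, .jpeg, .png); file contents are not inspected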
+            return any(f and isinstance(f, str) and f.strip() and
+                      not f.endswith('×') and
+                      (f.endswith('.pdf') or f.endswith('.jpg') or f.endswith('.jpeg') or f.endswith('.png'))
+                      for f in files)
+        # Document analysis - More lenient
+        documents = data.get('documents', [])
         if media_analysis['total_documents'] == 0:
             if check_files_exist(documents):
                 # Files exist but couldn't be analyzed
                 analysis_sections['documents'].append({
                     'status': 'error',
                     'message': 'Could not analyze provided documents.',
                     'details': 'Please ensure documents are in PDF format and are accessible.',
+                    'severity': 'medium',
                     'recommendation': 'Please check document format and try again.'
                 })
             else:
                 analysis_sections['documents'].append({
                     'check': 'documents_validation',
                     'status': 'missing',
+                    'message': 'Property documents are recommended.',
                     'details': 'Please upload relevant property documents in PDF format.',
+                    'severity': 'medium',
                     'recommendation': 'Upload property documents in PDF format.'
                 })
         else:
                         'status': 'error',
                         'message': f'Error analyzing document: {doc["error"]}',
                         'details': doc['summary'],
+                        'severity': 'medium',
                         'recommendation': 'Please ensure the document is a valid PDF file.'
                     })
                 elif doc['authenticity'] != 'verified':
                         'status': 'unverified',
                         'message': 'Document authenticity could not be verified.',
                         'details': doc['summary'],
+                        'severity': 'low',
                         'recommendation': 'Please provide clear, legible documents.'
                     })
+        # Image analysis - More lenient
+        images = data.get('images', [])
         if media_analysis['total_images'] == 0:
             if check_files_exist(images):
                 # Files exist but couldn't be analyzed
                 analysis_sections['documents'].append({
                     'status': 'error',
                     'message': 'Could not analyze provided images.',
                     'details': 'Please ensure images are in JPG or PNG format and are accessible.',
+                    'severity': 'medium',
                     'recommendation': 'Please check image format and try again.'
                 })
             else:
                 analysis_sections['documents'].append({
                     'check': 'images_validation',
                     'status': 'missing',
+                    'message': 'Property images are recommended.',
                     'details': 'Please upload at least one image of the property.',
+                    'severity': 'medium',
                     'recommendation': 'Upload property images in JPG or PNG format.'
                 })
         else:
                         'status': 'error',
                         'message': f'Error analyzing image: {img["error"]}',
                         'details': img['description'],
+                        'severity': 'medium',
                         'recommendation': 'Please ensure the image is in JPG or PNG format.'
                     })
                 elif not img['is_property_image']:
                         'status': 'unverified',
                         'message': 'Image may not be property-related.',
                         'details': img['description'],
+                        'severity': 'low',
                         'recommendation': 'Please provide clear property images.'
                     })
             analysis_sections['documents'].append({
                 'check': 'media_verification_scores',
                 'status': 'valid',
+                'message': 'Media verification completed.',
+                'details': f'Documents: {media_analysis["total_documents"]}, Images: {media_analysis["total_images"]}',
                 'severity': 'low',
+                'recommendation': 'Media verification successful.'
             })
         # Generate Summary
                 'inconsistent': 0,
                 'missing': 0,
                 'error': 0,
+                'unverified': 0,
+                'fraudulent': 0
             },
             'fraud_risk_level': 'low',
             'media_verification': {
                 'document_score': media_analysis['document_verification_score'],
                 'image_score': media_analysis['image_verification_score']
+            },
+            'fake_data_detected': fake_data_detected,
+            'fake_indicators': fake_indicators
         }
         # Calculate statistics
                 if check['status'] in summary['status_counts']:
                     summary['status_counts'][check['status']] += 1
+        # Calculate fraud risk level - Much stricter
         high_severity_issues = summary['severity_counts']['high']
+        fraudulent_issues = summary['status_counts']['fraudulent']
+        if fake_data_detected or fraudulent_issues > 0 or high_severity_issues > 3:
             summary['fraud_risk_level'] = 'high'
+        elif high_severity_issues > 1:
             summary['fraud_risk_level'] = 'medium'
+        else:
+            summary['fraud_risk_level'] = 'low'
         # Add summary to analysis
         analysis_sections['summary'] = [{
             'recommendation': f'Fraud Risk Level: {summary["fraud_risk_level"].upper()}. Review all findings and address high severity issues first.'
         }]
+        # Flatten all sections into a single list
+        all_checks = []
         for section_name, checks in analysis_sections.items():
             for check in checks:
+                check['section'] = section_name
+                all_checks.append(check)
+        return all_checks
     except Exception as e:
+        logger.error(f"Error in cross validation: {str(e)}")
         return [{
             'check': 'cross_validation_error',
             'status': 'error',
+            'message': f'Cross validation failed: {str(e)}',
+            'details': 'An error occurred during cross validation.',
+            'severity': 'medium',
             'recommendation': 'Please try again or contact support.'
         }]

models/fraud_classification.py CHANGED Viewed

@@ -1,178 +1,161 @@
 # models/fraud_classification.py
-import re
 from .model_loader import load_model
 from .logging_config import logger
 def classify_fraud(property_details, description):
     """
-    Classify the risk of fraud in a property listing using zero-shot classification.
-    This function analyzes property details and description to identify potential fraud indicators.
     """
     try:
-        # Initialize fraud classification result
         fraud_classification = {
             'alert_level': 'minimal',
             'alert_score': 0.0,
             'high_risk': [],
             'medium_risk': [],
             'low_risk': [],
-            'confidence_scores': {}
         }
-        # Accept property_details as dict or str
-        if isinstance(property_details, dict):
-            details_str = '\n'.join(f"{k}: {v}" for k, v in property_details.items())
-        else:
-            details_str = str(property_details)
-        text_to_analyze = f"{details_str}\n{description if description else ''}"
-        # Define risk categories for zero-shot classification
-        risk_categories = [
-            "fraudulent listing",
-            "misleading information",
-            "fake property",
-            "scam attempt",
-            "legitimate listing"
-        ]
-        # Perform zero-shot classification with better error handling
-        try:
-            classifier = load_model("zero-shot-classification")
-            if hasattr(classifier, 'task_type') and classifier.task_type == "zero-shot-classification":
-                # Using fallback classifier
-                result = classifier(text_to_analyze, risk_categories)
-            else:
-                # Using actual model
-                result = classifier(text_to_analyze, risk_categories, multi_label=True)
-        except Exception as e:
-            logger.error(f"Model error in fraud classification: {str(e)}")
-            # Use simple keyword-based fallback
-            result = simple_fraud_classification(text_to_analyze, risk_categories)
-        # Process classification results
         fraud_score = 0.0
         if isinstance(result, dict) and 'scores' in result:
             for label, score in zip(result.get('labels', []), result.get('scores', [])):
                 if label != "legitimate listing":
                     try:
                         score_val = float(score)
                     except Exception:
                         score_val = 0.0
                     fraud_score += score_val
                     fraud_classification['confidence_scores'][label] = score_val
         else:
             # Handle fallback result
-            fraud_score = 0.1  # Default low score for fallback
-        # Normalize fraud score to 0-1 range
         try:
-            fraud_score = min(1.0, fraud_score / (len(risk_categories) - 1))
         except Exception:
             fraud_score = 0.0
         fraud_classification['alert_score'] = fraud_score
-        # Define fraud indicators to check
-        fraud_indicators = {
-            'high_risk': [
-                r'urgent|immediate|hurry|limited time|special offer',
-                r'bank|transfer|wire|payment|money',
-                r'fake|scam|fraud|illegal|unauthorized',
-                r'guaranteed|promised|assured|certain',
-                r'contact.*whatsapp|whatsapp.*contact',
-                r'price.*negotiable|negotiable.*price',
-                r'no.*documents|documents.*not.*required',
-                r'cash.*only|only.*cash',
-                r'off.*market|market.*off',
-                r'under.*table|table.*under'
-            ],
-            'medium_risk': [
-                r'unverified|unconfirmed|unchecked',
-                r'partial|incomplete|missing',
-                r'different.*location|location.*different',
-                r'price.*increased|increased.*price',
-                r'no.*photos|photos.*not.*available',
-                r'contact.*email|email.*contact',
-                r'agent.*not.*available|not.*available.*agent',
-                r'property.*not.*viewable|not.*viewable.*property',
-                r'price.*changed|changed.*price',
-                r'details.*updated|updated.*details'
-            ],
-            'low_risk': [
-                r'new.*listing|listing.*new',
-                r'recent.*update|update.*recent',
-                r'price.*reduced|reduced.*price',
-                r'contact.*phone|phone.*contact',
-                r'agent.*available|available.*agent',
-                r'property.*viewable|viewable.*property',
-                r'photos.*available|available.*photos',
-                r'documents.*available|available.*documents',
-                r'price.*fixed|fixed.*price',
-                r'details.*complete|complete.*details'
-            ]
-        }
-        # Check for fraud indicators in text
-        for risk_level, patterns in fraud_indicators.items():
-            for pattern in patterns:
-                try:
-                    matches = re.finditer(pattern, text_to_analyze, re.IGNORECASE)
-                    for match in matches:
-                        indicator = match.group(0)
-                        if indicator not in fraud_classification[risk_level]:
-                            fraud_classification[risk_level].append(indicator)
-                except Exception as e:
-                    logger.warning(f"Regex error in fraud indicator pattern '{pattern}': {str(e)}")
-        # Determine alert level based on fraud score and indicators
-        try:
-            if fraud_score > 0.7 or len(fraud_classification['high_risk']) > 0:
-                fraud_classification['alert_level'] = 'critical'
-            elif fraud_score > 0.5 or len(fraud_classification['medium_risk']) > 2:
-                fraud_classification['alert_level'] = 'high'
-            elif fraud_score > 0.3 or len(fraud_classification['medium_risk']) > 0:
-                fraud_classification['alert_level'] = 'medium'
-            elif fraud_score > 0.1 or len(fraud_classification['low_risk']) > 0:
-                fraud_classification['alert_level'] = 'low'
-            else:
-                fraud_classification['alert_level'] = 'minimal'
-        except Exception as e:
-            logger.warning(f"Error determining alert level: {str(e)}")
             fraud_classification['alert_level'] = 'minimal'
-        # Additional checks for common fraud patterns
-        try:
-            if re.search(r'price.*too.*good|too.*good.*price', text_to_analyze, re.IGNORECASE):
-                fraud_classification['high_risk'].append("Unrealistically low price")
-            if re.search(r'no.*inspection|inspection.*not.*allowed', text_to_analyze, re.IGNORECASE):
-                fraud_classification['high_risk'].append("No property inspection allowed")
-            if re.search(r'owner.*abroad|abroad.*owner', text_to_analyze, re.IGNORECASE):
-                fraud_classification['medium_risk'].append("Owner claims to be abroad")
-            if re.search(r'agent.*unavailable|unavailable.*agent', text_to_analyze, re.IGNORECASE):
-                fraud_classification['medium_risk'].append("Agent unavailable for verification")
-        except Exception as e:
-            logger.warning(f"Error in additional fraud pattern checks: {str(e)}")
-        # Check for inconsistencies in property details
-        try:
-            if isinstance(property_details, dict) and 'price' in property_details and 'market_value' in property_details:
-                price_val = float(str(property_details['price']).replace(',', '').replace('₹', '').strip())
-                market_value_val = float(str(property_details['market_value']).replace(',', '').replace('₹', '').strip())
-                if price_val < market_value_val * 0.5:
-                    fraud_classification['high_risk'].append("Price significantly below market value")
-        except Exception as e:
-            logger.warning(f"Error checking price/market_value: {str(e)}")
         return fraud_classification
     except Exception as e:
         logger.error(f"Error in fraud classification: {str(e)}")
         return {
-            'alert_level': 'error',
-            'alert_score': 1.0,
-            'high_risk': [f"Error in fraud classification: {str(e)}"],
             'medium_risk': [],
             'low_risk': [],
-            'confidence_scores': {}
         }
 def simple_fraud_classification(text, categories):

 # models/fraud_classification.py
 from .model_loader import load_model
 from .logging_config import logger
+import re
 # A numeric literal such as "2", "1202", "2,50,00,000" or "2.5". The trailing
 # lookahead stops the regex engine from backtracking to the head of a longer
 # figure (e.g. reading "2,50,000" as "2").
 _NUMBER = r'(\d[\d,]*(?:\.\d+)?)(?!\d|[.,]\d)'
 # Rejects amounts that are scaled by a magnitude word ("50 lakh", "2 crore").
 _NOT_SCALED = r'(?!\s*(?:lakhs?|lacs?|crores?|cr|k|thousand|million|mn)\b)'
 # "price" immediately followed by a number (allows the quotes/colon/currency
 # sign that appear when a dict is rendered with str()).
 _PRICE_FIELD_RE = re.compile(r'price\W{0,6}' + _NUMBER + _NOT_SCALED)
 # Any explicit rupee amount.
 _RUPEE_AMOUNT_RE = re.compile(r'₹\s?' + _NUMBER + _NOT_SCALED)
 # NOTE(review): assumes property_details uses a 'sq_ft' key - confirm with caller.
 _AREA_FIELD_RE = re.compile(r'sq_ft\W{0,6}' + _NUMBER)
 # A free-standing number followed by a square-feet unit, e.g. "3 sq ft".
 _AREA_PHRASE_RE = re.compile(
     r'(?<![\d.,])' + _NUMBER + r'\s*(?:sq\.?\s*ft|sqft|square\s+feet)'
 )
 _MIN_PLAUSIBLE_PRICE = 1000      # the original heuristic targeted 1-3 digit prices
 _MIN_PLAUSIBLE_RUPEES = 10       # the original heuristic targeted single-digit rupee amounts
 _MIN_PLAUSIBLE_AREA_SQFT = 10    # the original heuristic targeted single-digit areas
 _REPEATED_NUMBER_MIN_COUNT = 4   # e.g. "2, 2, 2, 2"
 _REPEATED_NUMBER_SHARE = 0.75    # share of all numbers taken by the most common one
 # Down-weighting applied to each non-legitimate label (unchanged from the
 # previous implementation).
 _RISK_WEIGHTS = {
     "suspicious listing": 0.5,
     "potential fraud": 0.7,
     "high risk listing": 0.8,
 }
 def _any_value_below(pattern, text, minimum):
     """Return True if any number captured by *pattern* in *text* is below *minimum*."""
     return any(
         float(match.group(1).replace(',', '')) < minimum
         for match in pattern.finditer(text)
     )
 def _looks_like_fake_data(text_lower, description):
     """Return True when the listing shows obvious placeholder/fake values.
     Checks, in order: a description that is nothing but digits, a price field
     or rupee amount that is implausibly small, a single-digit area in square
     feet, and a listing whose numbers are almost all the same value.
     Whole numeric values are parsed and compared, so "₹2,50,00,000" or
     "1202 sq ft" are no longer mistaken for "₹2" or "2 sq ft". Bedroom and
     bathroom counts are not inspected: one- or two-digit counts are the
     normal case and carry no fraud signal.
     """
     if str(description).strip().isdigit():
         return True
     if _any_value_below(_PRICE_FIELD_RE, text_lower, _MIN_PLAUSIBLE_PRICE):
         return True
     if _any_value_below(_RUPEE_AMOUNT_RE, text_lower, _MIN_PLAUSIBLE_RUPEES):
         return True
     if _any_value_below(_AREA_FIELD_RE, text_lower, _MIN_PLAUSIBLE_AREA_SQFT):
         return True
     if _any_value_below(_AREA_PHRASE_RE, text_lower, _MIN_PLAUSIBLE_AREA_SQFT):
         return True
     numbers = re.findall(r'\b\d+\b', text_lower)
     if len(numbers) >= _REPEATED_NUMBER_MIN_COUNT:
         most_repeated = max(map(numbers.count, set(numbers)))
         if most_repeated / len(numbers) >= _REPEATED_NUMBER_SHARE:
             return True
     return False
 def classify_fraud(property_details, description):
     """
     Classify the fraud risk of a property listing.
     The listing is first screened for obvious placeholder data; if any is
     found a fixed high-risk result is returned without loading the model.
     Otherwise the text is scored with a zero-shot classifier over four risk
     categories and the weighted share of the non-legitimate categories
     becomes the fraud score.
     Args:
         property_details: Listing fields; rendered with str() for analysis.
         description: Free-text description of the listing.
     Returns:
         dict with keys 'alert_level' ('minimal', 'low', 'medium' or 'high'),
         'alert_score' (0.0-1.0), 'confidence_scores' (label -> weighted
         score), 'high_risk', 'medium_risk', 'low_risk' (lists of findings)
         and 'reasoning' (human-readable summary). Any failure is logged and
         reported as a 'minimal' result whose reasoning carries the error.
     """
     try:
         # Combine property details and description for analysis
         text_to_analyze = f"{property_details} {description}"
         # Screen for placeholder/fake data before paying for model inference
         if _looks_like_fake_data(text_to_analyze.lower(), description):
             return {
                 'alert_level': 'high',
                 'alert_score': 0.9,  # 90% fraud score for fake data
                 'confidence_scores': {
                     'high risk listing': 0.9,
                     'potential fraud': 0.8,
                     'suspicious listing': 0.7,
                     'legitimate listing': 0.1
                 },
                 'high_risk': ['Fake data patterns detected'],
                 'medium_risk': [],
                 'low_risk': [],
                 'reasoning': 'This property was classified as high risk due to detected fake data patterns (repeated numbers, suspiciously low values, unrealistic specifications).'
             }
         # presumably returns a Hugging Face zero-shot pipeline (or a
         # compatible fallback callable) - verify against model_loader
         classifier = load_model("zero-shot-classification", "facebook/bart-large-mnli")
         risk_categories = [
             "legitimate listing",
             "suspicious listing",
             "potential fraud",
             "high risk listing"
         ]
         # Only the first 1000 characters are sent to the classifier
         result = classifier(text_to_analyze[:1000], risk_categories, multi_label=False)
         fraud_classification = {
             'alert_level': 'minimal',
             'alert_score': 0.0,
             'confidence_scores': {},
             'high_risk': [],
             'medium_risk': [],
             'low_risk': [],
             'reasoning': ''
         }
         if isinstance(result, dict) and 'scores' in result:
             total_mass = 0.0
             weighted_risk = 0.0
             for label, score in zip(result.get('labels', []), result.get('scores', [])):
                 try:
                     raw_score = float(score)
                 except (TypeError, ValueError):
                     raw_score = 0.0
                 total_mass += raw_score
                 if label == "legitimate listing":
                     continue
                 weighted = raw_score * _RISK_WEIGHTS.get(label, 1.0)
                 weighted_risk += weighted
                 fraud_classification['confidence_scores'][label] = weighted
             # The fraud score is the weighted share of risk labels in the
             # total score mass. The previous "/ 3 * 0.7" scaling capped the
             # score below the lowest threshold (0.2), so every model-scored
             # listing came out 'minimal'. Dividing by the total keeps the
             # result in [0, 0.8] whether or not the scores sum to 1.
             fraud_score = weighted_risk / total_mass if total_mass > 0 else 0.0
         else:
             # Handle fallback result
             fraud_score = 0.05
         fraud_score = max(0.0, min(1.0, fraud_score))
         fraud_classification['alert_score'] = fraud_score
         # Map the score to an alert level and the matching verdict wording
         if fraud_score >= 0.7:
             alert_level, verdict = 'high', 'high risk'
         elif fraud_score >= 0.4:
             alert_level, verdict = 'medium', 'medium risk'
         elif fraud_score >= 0.2:
             alert_level, verdict = 'low', 'low risk'
         else:
             alert_level, verdict = 'minimal', 'legitimate'
         fraud_classification['alert_level'] = alert_level
         reasoning_parts = [
             f"This property was classified as {verdict} based on AI analysis of the listing details."
         ]
         # Name the strongest risk label when it is pronounced enough
         if fraud_classification['confidence_scores']:
             highest_risk = max(fraud_classification['confidence_scores'].items(), key=lambda x: x[1])
             if highest_risk[1] > 0.3:
                 reasoning_parts.append(f"Primary concern: {highest_risk[0]} (confidence: {highest_risk[1]:.0%})")
         fraud_classification['reasoning'] = " ".join(reasoning_parts)
         return fraud_classification
     except Exception as e:
         # Best-effort boundary: a failed analysis must not break the caller
         logger.error(f"Error in fraud classification: {str(e)}")
         return {
             'alert_level': 'minimal',
             'alert_score': 0.05,
             'confidence_scores': {},
             'high_risk': [],
             'medium_risk': [],
             'low_risk': [],
             'reasoning': f'Fraud analysis failed: {str(e)}'
         }
 def simple_fraud_classification(text, categories):

models/trust_score.py CHANGED Viewed

@@ -2,54 +2,173 @@
 from .model_loader import load_model
 from .logging_config import logger
 def generate_trust_score(text, image_analysis, pdf_analysis):
     try:
-        # Use a simpler approach to avoid timeouts
-        trust_score = 50.0  # Start with neutral score
         reasoning_parts = []
         # Simple text-based trust indicators
         text_lower = str(text).lower()
-        # Positive indicators
         positive_indicators = [
             'verified', 'authentic', 'genuine', 'real', 'legitimate',
-            'complete', 'detailed', 'professional', 'official', 'certified'
         ]
-        # Negative indicators
         negative_indicators = [
             'fake', 'scam', 'fraud', 'suspicious', 'unverified',
-            'incomplete', 'missing', 'unclear', 'doubtful', 'questionable'
         ]
         # Count positive and negative indicators
         positive_count = sum(1 for indicator in positive_indicators if indicator in text_lower)
         negative_count = sum(1 for indicator in negative_indicators if indicator in text_lower)
-        # Adjust score based on indicators
-        if positive_count > 0:
-            trust_score += min(20, positive_count * 5)
             reasoning_parts.append(f"Found {positive_count} positive trust indicators")
         if negative_count > 0:
-            trust_score -= min(30, negative_count * 10)
             reasoning_parts.append(f"Found {negative_count} negative trust indicators")
-        # Image analysis contribution
         if image_analysis:
             image_count = len(image_analysis) if isinstance(image_analysis, list) else 1
             if image_count > 0:
-                trust_score += min(15, image_count * 3)
-                reasoning_parts.append(f"Property has {image_count} images")
-        # PDF analysis contribution
         if pdf_analysis:
             pdf_count = len(pdf_analysis) if isinstance(pdf_analysis, list) else 1
             if pdf_count > 0:
-                trust_score += min(15, pdf_count * 5)
-                reasoning_parts.append(f"Property has {pdf_count} documents")
         # Ensure score is within bounds
         trust_score = max(0, min(100, trust_score))
@@ -64,4 +183,4 @@ def generate_trust_score(text, image_analysis, pdf_analysis):
     except Exception as e:
         logger.error(f"Error in trust score generation: {str(e)}")
-        return 35.0, f"Trust analysis failed: {str(e)}"

 from .model_loader import load_model
 from .logging_config import logger
+import re
 def generate_trust_score(text, image_analysis, pdf_analysis):
     try:
+        # Start with a much lower base score and be very strict
+        trust_score = 20.0  # Drastically reduced from 60.0
         reasoning_parts = []
         # Simple text-based trust indicators
         text_lower = str(text).lower()
+        # CRITICAL: Check for obvious fake data patterns
+        fake_patterns = [
+            r'\b\d+\s*$',  # Numbers at end of lines
+            r'^\d+$',      # Only numbers
+            r'\b\d{1,2}\s*$',  # Single or double digits
+            r'price.*\d{1,3}',  # Very low prices
+            r'size.*\d{1,3}',   # Very small sizes
+            r'bedrooms.*\d{1,2}',  # Very few bedrooms
+            r'bathrooms.*\d{1,2}', # Very few bathrooms
+        ]
+        fake_detected = False
+        for pattern in fake_patterns:
+            if re.search(pattern, text_lower):
+                fake_detected = True
+                trust_score -= 30  # Heavy penalty for fake patterns
+                reasoning_parts.append("Detected suspicious number patterns")
+                break
+        # Check for repeated numbers (like "2, 2, 2, 2")
+        numbers = re.findall(r'\b\d+\b', text_lower)
+        if len(numbers) >= 3:
+            unique_numbers = set(numbers)
+            if len(unique_numbers) <= 2:  # If most numbers are the same
+                fake_detected = True
+                trust_score -= 40  # Very heavy penalty
+                reasoning_parts.append("Detected repeated number patterns (likely fake data)")
+        # Check for extremely low values
+        if any(word in text_lower for word in ['₹2', '₹1', '₹3', '₹4', '₹5']):
+            fake_detected = True
+            trust_score -= 50  # Extremely heavy penalty
+            reasoning_parts.append("Detected suspiciously low pricing")
+        # Check for very small property sizes
+        if any(word in text_lower for word in ['2 sq ft', '1 sq ft', '3 sq ft', '4 sq ft', '5 sq ft']):
+            fake_detected = True
+            trust_score -= 40
+            reasoning_parts.append("Detected unrealistic property size")
+        # Check for generic property names
+        if any(word in text_lower for word in ['2', '1', '3', '4', '5']) and len(text.strip()) < 50:
+            fake_detected = True
+            trust_score -= 30
+            reasoning_parts.append("Detected generic/numeric property name")
+        # Positive indicators - Much more strict
         positive_indicators = [
             'verified', 'authentic', 'genuine', 'real', 'legitimate',
+            'complete', 'detailed', 'professional', 'official', 'certified',
+            'luxurious', 'modern', 'spacious', 'well-maintained', 'prime location',
+            'amenities', 'security', 'parking', 'garden', 'balcony',
+            'renovated', 'furnished', 'semi-furnished', 'ready to move',
+            'clear title', 'no litigation', 'approved', 'registered'
         ]
+        # Negative indicators - More comprehensive
         negative_indicators = [
             'fake', 'scam', 'fraud', 'suspicious', 'unverified',
+            'incomplete', 'missing', 'unclear', 'doubtful', 'questionable',
+            'urgent sale', 'quick sale', 'no documents needed', 'cash only',
+            'below market', 'distress sale', 'owner abroad', 'inheritance',
+            'unclear title', 'litigation', 'dispute', 'encroachment'
         ]
         # Count positive and negative indicators
         positive_count = sum(1 for indicator in positive_indicators if indicator in text_lower)
         negative_count = sum(1 for indicator in negative_indicators if indicator in text_lower)
+        # Adjust score based on indicators - Much stricter
+        if positive_count > 0 and not fake_detected:
+            trust_score += min(15, positive_count * 2)  # Reduced from 25 to 15
             reasoning_parts.append(f"Found {positive_count} positive trust indicators")
         if negative_count > 0:
+            trust_score -= min(30, negative_count * 8)  # Increased penalty from 20 to 30
             reasoning_parts.append(f"Found {negative_count} negative trust indicators")
+        # Image analysis contribution - Much stricter
         if image_analysis:
             image_count = len(image_analysis) if isinstance(image_analysis, list) else 1
             if image_count > 0:
+                # Check if images are actually property-related
+                property_related_count = sum(1 for img in image_analysis if img.get('is_property_related', False))
+                if property_related_count > 0:
+                    trust_score += min(10, property_related_count * 3)  # Reduced from 20 to 10
+                    reasoning_parts.append(f"Property has {property_related_count} property-related images")
+                else:
+                    trust_score -= 20  # Penalty for non-property images
+                    reasoning_parts.append("No property-related images detected")
+                # Bonus for multiple high-quality images
+                if property_related_count >= 3:
+                    trust_score += 5
+                    reasoning_parts.append("Multiple property images provided")
+        # PDF analysis contribution - Much stricter
         if pdf_analysis:
             pdf_count = len(pdf_analysis) if isinstance(pdf_analysis, list) else 1
             if pdf_count > 0:
+                # Check if documents are actually property-related
+                property_related_docs = sum(1 for doc in pdf_analysis if doc.get('is_property_related', False))
+                if property_related_docs > 0:
+                    trust_score += min(10, property_related_docs * 4)  # Reduced from 20 to 10
+                    reasoning_parts.append(f"Property has {property_related_docs} property-related documents")
+                else:
+                    trust_score -= 15  # Penalty for non-property documents
+                    reasoning_parts.append("No property-related documents detected")
+                # Bonus for multiple documents
+                if property_related_docs >= 2:
+                    trust_score += 3
+                    reasoning_parts.append("Multiple supporting documents provided")
+        # Text quality assessment - Much stricter
+        if text and len(text) > 200 and not fake_detected:
+            trust_score += 8
+            reasoning_parts.append("Detailed property description provided")
+        elif text and len(text) > 100 and not fake_detected:
+            trust_score += 4
+            reasoning_parts.append("Adequate property description provided")
+        elif len(text) < 50:
+            trust_score -= 20  # Heavy penalty for very short descriptions
+            reasoning_parts.append("Very short property description")
+        # Location quality assessment - Much stricter
+        if 'hyderabad' in text_lower or 'mumbai' in text_lower or 'delhi' in text_lower or 'bangalore' in text_lower:
+            if not fake_detected:
+                trust_score += 3
+                reasoning_parts.append("Property in major city")
+        # Property type assessment - Much stricter
+        if any(prop_type in text_lower for prop_type in ['apartment', 'flat', 'house', 'villa', 'bungalow']):
+            if not fake_detected:
+                trust_score += 2
+                reasoning_parts.append("Clear property type mentioned")
+        # Amenities assessment - Much stricter
+        amenities_count = sum(1 for amenity in ['pool', 'gym', 'garden', 'parking', 'security', 'lift', 'balcony']
+                             if amenity in text_lower)
+        if amenities_count > 0 and not fake_detected:
+            trust_score += min(5, amenities_count * 1)  # Reduced from 10 to 5
+            reasoning_parts.append(f"Property has {amenities_count} amenities mentioned")
+        # CRITICAL: Additional fake data checks
+        # Check if all major fields are just numbers
+        numeric_fields = ['property_name', 'bedrooms', 'bathrooms', 'sq_ft', 'market_value']
+        numeric_count = 0
+        for field in numeric_fields:
+            if field in text_lower and re.search(r'\b\d{1,2}\b', text_lower):
+                numeric_count += 1
+        if numeric_count >= 3:  # If 3+ fields are just numbers
+            fake_detected = True
+            trust_score -= 60  # Extremely heavy penalty
+            reasoning_parts.append("Multiple fields contain only numbers (highly suspicious)")
         # Ensure score is within bounds
         trust_score = max(0, min(100, trust_score))
     except Exception as e:
         logger.error(f"Error in trust score generation: {str(e)}")
+        return 10.0, f"Trust analysis failed: {str(e)}"  # Reduced from 50.0 to 10.0