Spaces:

EGYADMIN
/

Wahbi-AI

Sleeping

App Files Community

EGYADMIN committed on Apr 5

Commit

c9982ee

verified ·

1 Parent(s): ea61973

Update modules/document_analysis/analyzer.py

Browse files

Files changed (1) hide show

modules/document_analysis/analyzer.py +36 -17

modules/document_analysis/analyzer.py CHANGED Viewed

@@ -171,15 +171,27 @@ class DocumentAnalyzer:
             raise
     def _extract_text_from_pdf(self, document_path):
-        """استخراج النص من ملف PDF (تحتاج إلى مكتبة مثل PyPDF2 أو pdfplumber)"""
-        #  Implementation using a PDF processing library like PyPDF2 or pdfplumber is needed here.
-        # This is a placeholder.  Replace with actual PDF text extraction.
-        return "Placeholder text extracted from PDF"
     def _analyze_contract_terms(self, text):
         """تحليل بنود العقد"""
-        #  Implementation for contract term analysis is needed here.  This is a placeholder.
-        return "Placeholder contract terms analysis"
     def _analyze_financial_terms(self, text):
         """تحليل الجزء المالي"""
@@ -447,20 +459,27 @@ class DocumentAnalyzer:
             if img.mode == 'RGBA':
                 img = img.convert('RGB')
-            # تحديد الحجم الأقصى وضغط أكبر
             max_size = (1200, 1200)
-            img.thumbnail(max_size, Image.Resampling.LANCZOS)
-            # ضغط الصورة بجودة أقل للحصول على حجم أصغر
-            buffer = io.BytesIO()
-            img.save(buffer, format='JPEG', quality=60, optimize=True)
-            # التحقق من الحجم والضغط أكثر إذا لزم الأمر
-            if len(buffer.getvalue()) > 5000000:  # 5MB
-                # محاولة ضغط إضافية
-                img.thumbnail((800, 800), Image.Resampling.LANCZOS)
                 buffer = io.BytesIO()
-                img.save(buffer, format='JPEG', quality=40, optimize=True)
             # تحويل الصورة المضغوطة إلى base64
             return base64.b64encode(buffer.getvalue()).decode('utf-8')

             raise
     def _extract_text_from_pdf(self, document_path):
+        """استخراج النص من ملف PDF"""
+        try:
+            import PyPDF2
+            text = ""
+            with open(document_path, 'rb') as file:
+                reader = PyPDF2.PdfReader(file)
+                for page in reader.pages:
+                    text += page.extract_text() + "\n"
+            return text
+        except Exception as e:
+            logger.error(f"خطأ في استخراج النص من PDF: {str(e)}")
+            raise
     def _analyze_contract_terms(self, text):
         """تحليل بنود العقد"""
+        terms = []
+        sections = text.split('\n\n')
+        for section in sections:
+            if any(keyword in section.lower() for keyword in ['شروط', 'بند', 'يلتزم', 'يجب']):
+                terms.append(section.strip())
+        return terms
     def _analyze_financial_terms(self, text):
         """تحليل الجزء المالي"""
             if img.mode == 'RGBA':
                 img = img.convert('RGB')
+            # البدء بجودة عالية وتقليلها تدريجياً حتى نصل للحجم المطلوب
+            quality = 95
             max_size = (1200, 1200)
+            while True:
+                img.thumbnail(max_size, Image.Resampling.LANCZOS)
                 buffer = io.BytesIO()
+                img.save(buffer, format='JPEG', quality=quality, optimize=True)
+                size = len(buffer.getvalue())
+                # إذا كان الحجم أقل من 5 ميجابايت، نخرج من الحلقة
+                if size <= 5000000:
+                    break
+                # تقليل الجودة والحجم
+                quality = max(quality - 10, 20)  # لا نقلل الجودة عن 20
+                max_size = (int(max_size[0] * 0.8), int(max_size[1] * 0.8))
+                # إذا وصلنا للحد الأدنى من الجودة والحجم ولم نصل للحجم المطلوب
+                if quality == 20 and max_size[0] < 400:
+                    raise ValueError("لا يمكن ضغط الصورة للحجم المطلوب")
             # تحويل الصورة المضغوطة إلى base64
             return base64.b64encode(buffer.getvalue()).decode('utf-8')