Spaces:

EGYADMIN
/

Wahbi-AI

Sleeping

App Files Files Community

EGYADMIN committed on Apr 5

Commit

ea61973

verified ·

1 Parent(s): d21d576

Update modules/document_analysis/analyzer.py

Browse files

Files changed (1) hide show

modules/document_analysis/analyzer.py +92 -50

modules/document_analysis/analyzer.py CHANGED Viewed

@@ -9,6 +9,10 @@ import threading
 from pathlib import Path
 import datetime
 import json
 # تهيئة السجل
 logging.basicConfig(
@@ -19,33 +23,33 @@ logger = logging.getLogger('document_analysis')
 class DocumentAnalyzer:
     """فئة تحليل المستندات"""
     def __init__(self, config=None):
         """تهيئة محلل المستندات"""
         self.config = config
         self.analysis_in_progress = False
         self.current_document = None
         self.analysis_results = {}
         # إنشاء مجلد المستندات إذا لم يكن موجوداً
         if config and hasattr(config, 'DOCUMENTS_PATH'):
             self.documents_path = Path(config.DOCUMENTS_PATH)
         else:
             self.documents_path = Path('data/documents')
         if not self.documents_path.exists():
             self.documents_path.mkdir(parents=True, exist_ok=True)
     def analyze_document(self, document_path, document_type="tender", callback=None):
         """تحليل مستند"""
         if self.analysis_in_progress:
             logger.warning("هناك عملية تحليل جارية بالفعل")
             return False
         if not os.path.exists(document_path):
             logger.error(f"المستند غير موجود: {document_path}")
             return False
         self.analysis_in_progress = True
         self.current_document = document_path
         self.analysis_results = {
@@ -59,7 +63,7 @@ class DocumentAnalyzer:
             "amounts": [],
             "risks": []
         }
         # بدء التحليل في خيط منفصل
         thread = threading.Thread(
             target=self._analyze_document_thread,
@@ -67,15 +71,15 @@ class DocumentAnalyzer:
         )
         thread.daemon = True
         thread.start()
         return True
     def _analyze_document_thread(self, document_path, document_type, callback):
         """خيط تحليل المستند"""
         try:
             # تحديد نوع المستند
             file_extension = os.path.splitext(document_path)[1].lower()
             if file_extension == '.pdf':
                 self.analysis_results = self._analyze_pdf(document_path, document_type)
             elif file_extension == '.docx':
@@ -88,32 +92,32 @@ class DocumentAnalyzer:
                 logger.error(f"نوع المستند غير مدعوم: {file_extension}")
                 self.analysis_results["status"] = "فشل التحليل"
                 self.analysis_results["error"] = "نوع المستند غير مدعوم"
             # تحديث حالة التحليل
             if self.analysis_results["status"] != "فشل التحليل":
                 self.analysis_results["status"] = "اكتمل التحليل"
                 self.analysis_results["analysis_end_time"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
             logger.info(f"اكتمل تحليل المستند: {document_path}")
         except Exception as e:
             logger.error(f"خطأ في تحليل المستند: {str(e)}")
             self.analysis_results["status"] = "فشل التحليل"
             self.analysis_results["error"] = str(e)
         finally:
             self.analysis_in_progress = False
             # استدعاء دالة الاستجابة إذا تم توفيرها
             if callback and callable(callback):
                 callback(self.analysis_results)
     def _analyze_pdf(self, document_path, document_type):
         """تحليل مستند PDF باستخدام الذكاء الاصطناعي"""
         try:
             # استخراج النص من PDF
             text = self._extract_text_from_pdf(document_path)
             # تحليل متقدم للمستند
             analysis = {
                 "file_info": {
@@ -214,10 +218,10 @@ class DocumentAnalyzer:
         try:
             # محاكاة تحليل مستند Word
             logger.info(f"تحليل مستند Word: {document_path}")
             # في التطبيق الفعلي، سيتم استخدام مكتبة مثل python-docx
             # لاستخراج النص من ملف Word وتحليله
             # محاكاة استخراج البنود والكيانات والتواريخ والمبالغ والمخاطر
             # (مشابه لتحليل PDF)
             self.analysis_results["items"] = [
@@ -225,56 +229,56 @@ class DocumentAnalyzer:
                 {"id": 2, "name": "تركيب المعدات", "description": "تركيب وتشغيل المعدات", "unit": "مجموعة", "estimated_quantity": 10},
                 {"id": 3, "name": "التدريب", "description": "تدريب الموظفين على استخدام المعدات", "unit": "يوم", "estimated_quantity": 20}
             ]
             # محاكاة استخراج الكيانات والتواريخ والمبالغ والمخاطر
             # (مشابه لتحليل PDF)
         except Exception as e:
             logger.error(f"خطأ في تحليل مستند Word: {str(e)}")
             raise
     def _analyze_xlsx(self, document_path, document_type):
         """Analyze an Excel document (simulated).

         No file is read here: fixed sample ``items`` and ``amounts`` lists are
         stored on ``self.analysis_results``. ``document_type`` is accepted but
         not used.

         Args:
             document_path: Path of the Excel file; only used in log messages.
             document_type: Unused.

         Raises:
             Exception: Any error is logged and then re-raised unchanged.
         """
         try:
             logger.info(f"تحليل مستند Excel: {document_path}")

             # A real implementation would use a library such as pandas or
             # openpyxl to read the workbook; for now three sample rows are faked.
             sample_rows = (1, 2, 3)

             # Simulated item extraction.
             self.analysis_results["items"] = [
                 {
                     "id": row,
                     "name": f"بند {row}",
                     "description": f"وصف البند {row}",
                     "unit": "وحدة",
                     "estimated_quantity": row * 100,
                 }
                 for row in sample_rows
             ]

             # Simulated amount extraction.
             self.analysis_results["amounts"] = [
                 {
                     "type": "item_cost",
                     "amount": row * 10000,
                     "currency": "SAR",
                     "description": f"تكلفة البند {row}",
                 }
                 for row in sample_rows
             ]
         except Exception as exc:
             logger.error(f"خطأ في تحليل مستند Excel: {str(exc)}")
             raise
     def _analyze_txt(self, document_path, document_type):
         """Analyze a plain-text document (simulated).

         Currently this only logs that the analysis was requested; nothing is
         read from the file and ``self.analysis_results`` is left untouched.
         ``document_type`` is accepted but not used.

         Raises:
             Exception: Any error is logged and then re-raised unchanged.
         """
         try:
             start_message = f"تحليل مستند نصي: {document_path}"
             logger.info(start_message)
             # A real implementation would read the text file here and extract
             # items, entities, dates, amounts and risks like the other analyzers.
         except Exception as exc:
             logger.error(f"خطأ في تحليل مستند نصي: {str(exc)}")
             raise
     def get_analysis_status(self):
         """الحصول على حالة التحليل الحالي"""
         if not self.analysis_in_progress:
@@ -282,53 +286,53 @@ class DocumentAnalyzer:
                 return {"status": "لا يوجد تحليل جارٍ"}
             else:
                 return {"status": self.analysis_results.get("status", "غير معروف")}
         return {
             "status": "جاري التحليل",
             "document_path": self.current_document,
             "start_time": self.analysis_results.get("analysis_start_time")
         }
     def get_analysis_results(self):
         """الحصول على نتائج التحليل"""
         return self.analysis_results
     def export_analysis_results(self, output_path=None):
         """Write the stored analysis results to a JSON file.

         Args:
             output_path: Destination file. When falsy, a timestamped
                 ``analysis_results_<YYYYmmdd_HHMMSS>.json`` file inside
                 ``self.documents_path`` is used instead.

         Returns:
             The path that was written, or ``None`` when there are no results
             to export or the write fails (the failure is logged, not raised).
         """
         # Nothing to export.
         if not self.analysis_results:
             logger.warning("لا توجد نتائج تحليل للتصدير")
             return None

         # Build a default, timestamped file name when no target was given.
         if not output_path:
             stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
             default_name = f"analysis_results_{stamp}.json"
             output_path = os.path.join(self.documents_path, default_name)

         try:
             with open(output_path, 'w', encoding='utf-8') as out_file:
                 # ensure_ascii=False keeps non-ASCII text readable in the file.
                 json.dump(self.analysis_results, out_file, ensure_ascii=False, indent=4)
             logger.info(f"تم تصدير نتائج التحليل إلى: {output_path}")
             return output_path
         except Exception as exc:
             logger.error(f"خطأ في تصدير نتائج التحليل: {str(exc)}")
             return None
     def import_analysis_results(self, input_path):
         """Load analysis results from a JSON file into the analyzer.

         Args:
             input_path: Path of the JSON file to read.

         Returns:
             bool: ``True`` when the file was parsed and stored in
             ``self.analysis_results``; ``False`` when the file does not exist
             or reading/parsing fails (the failure is logged, not raised).
         """
         source_exists = os.path.exists(input_path)
         if not source_exists:
             logger.error(f"ملف نتائج التحليل غير موجود: {input_path}")
             return False

         try:
             with open(input_path, 'r', encoding='utf-8') as in_file:
                 loaded = json.load(in_file)
                 self.analysis_results = loaded
             logger.info(f"تم استيراد نتائج التحليل من: {input_path}")
             return True
         except Exception as exc:
             logger.error(f"خطأ في استيراد نتائج التحليل: {str(exc)}")
             return False
@@ -377,11 +381,11 @@ class DocumentAnalyzer:
         # البحث عن القيم النقدية
         currency_pattern = r'[\d,]+\.?\d*\s*(?:ريال|دولار|SAR|USD)'
         currencies = re.findall(currency_pattern, text)
         # البحث عن النسب المئوية
         percentage_pattern = r'\d+\.?\d*\s*%'
         percentages = re.findall(percentage_pattern, text)
         return {
             "currencies": currencies,
             "percentages": percentages
@@ -398,7 +402,7 @@ class DocumentAnalyzer:
         avg_word_length = sum(len(word) for word in words) / len(words)
         sentences = text.split('.')
         avg_sentence_length = len(words) / len(sentences)
         # حساب درجة التعقيد (1-10)
         complexity = min((avg_word_length * 0.5 + avg_sentence_length * 0.2), 10)
         return round(complexity, 2)
@@ -413,12 +417,12 @@ class DocumentAnalyzer:
             "الغرامات",
             "شروط الدفع"
         ]
         missing = []
         for section in required_sections:
             if section not in text:
                 missing.append(section)
         return missing
     def _find_related_documents(self, document_path):
@@ -426,9 +430,47 @@ class DocumentAnalyzer:
         directory = os.path.dirname(document_path)
         base_name = os.path.basename(document_path)
         related = []
         for file in os.listdir(directory):
             if file != base_name and file.startswith(base_name.split('_')[0]):
                 related.append(file)
         return related

 from pathlib import Path
 import datetime
 import json
+import base64
+import time
+from PIL import Image
+import io
 # تهيئة السجل
 logging.basicConfig(
 class DocumentAnalyzer:
     """فئة تحليل المستندات"""
     def __init__(self, config=None):
         """تهيئة محلل المستندات"""
         self.config = config
         self.analysis_in_progress = False
         self.current_document = None
         self.analysis_results = {}
         # إنشاء مجلد المستندات إذا لم يكن موجوداً
         if config and hasattr(config, 'DOCUMENTS_PATH'):
             self.documents_path = Path(config.DOCUMENTS_PATH)
         else:
             self.documents_path = Path('data/documents')
         if not self.documents_path.exists():
             self.documents_path.mkdir(parents=True, exist_ok=True)
     def analyze_document(self, document_path, document_type="tender", callback=None):
         """تحليل مستند"""
         if self.analysis_in_progress:
             logger.warning("هناك عملية تحليل جارية بالفعل")
             return False
         if not os.path.exists(document_path):
             logger.error(f"المستند غير موجود: {document_path}")
             return False
         self.analysis_in_progress = True
         self.current_document = document_path
         self.analysis_results = {
             "amounts": [],
             "risks": []
         }
         # بدء التحليل في خيط منفصل
         thread = threading.Thread(
             target=self._analyze_document_thread,
         )
         thread.daemon = True
         thread.start()
         return True
     def _analyze_document_thread(self, document_path, document_type, callback):
         """خيط تحليل المستند"""
         try:
             # تحديد نوع المستند
             file_extension = os.path.splitext(document_path)[1].lower()
             if file_extension == '.pdf':
                 self.analysis_results = self._analyze_pdf(document_path, document_type)
             elif file_extension == '.docx':
                 logger.error(f"نوع المستند غير مدعوم: {file_extension}")
                 self.analysis_results["status"] = "فشل التحليل"
                 self.analysis_results["error"] = "نوع المستند غير مدعوم"
             # تحديث حالة التحليل
             if self.analysis_results["status"] != "فشل التحليل":
                 self.analysis_results["status"] = "اكتمل التحليل"
                 self.analysis_results["analysis_end_time"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
             logger.info(f"اكتمل تحليل المستند: {document_path}")
         except Exception as e:
             logger.error(f"خطأ في تحليل المستند: {str(e)}")
             self.analysis_results["status"] = "فشل التحليل"
             self.analysis_results["error"] = str(e)
         finally:
             self.analysis_in_progress = False
             # استدعاء دالة الاستجابة إذا تم توفيرها
             if callback and callable(callback):
                 callback(self.analysis_results)
     def _analyze_pdf(self, document_path, document_type):
         """تحليل مستند PDF باستخدام الذكاء الاصطناعي"""
         try:
             # استخراج النص من PDF
             text = self._extract_text_from_pdf(document_path)
             # تحليل متقدم للمستند
             analysis = {
                 "file_info": {
         try:
             # محاكاة تحليل مستند Word
             logger.info(f"تحليل مستند Word: {document_path}")
             # في التطبيق الفعلي، سيتم استخدام مكتبة مثل python-docx
             # لاستخراج النص من ملف Word وتحليله
             # محاكاة استخراج البنود والكيانات والتواريخ والمبالغ والمخاطر
             # (مشابه لتحليل PDF)
             self.analysis_results["items"] = [
                 {"id": 2, "name": "تركيب المعدات", "description": "تركيب وتشغيل المعدات", "unit": "مجموعة", "estimated_quantity": 10},
                 {"id": 3, "name": "التدريب", "description": "تدريب الموظفين على استخدام المعدات", "unit": "يوم", "estimated_quantity": 20}
             ]
             # محاكاة استخراج الكيانات والتواريخ والمبالغ والمخاطر
             # (مشابه لتحليل PDF)
         except Exception as e:
             logger.error(f"خطأ في تحليل مستند Word: {str(e)}")
             raise
     def _analyze_xlsx(self, document_path, document_type):
         """Analyze an Excel document (simulated).

         No file is read here: fixed sample ``items`` and ``amounts`` lists are
         stored on ``self.analysis_results``. ``document_type`` is accepted but
         not used.

         Args:
             document_path: Path of the Excel file; only used in log messages.
             document_type: Unused.

         Raises:
             Exception: Any error is logged and then re-raised unchanged.
         """
         try:
             logger.info(f"تحليل مستند Excel: {document_path}")

             # A real implementation would use a library such as pandas or
             # openpyxl to read the workbook; for now three sample rows are faked.
             sample_rows = (1, 2, 3)

             # Simulated item extraction.
             self.analysis_results["items"] = [
                 {
                     "id": row,
                     "name": f"بند {row}",
                     "description": f"وصف البند {row}",
                     "unit": "وحدة",
                     "estimated_quantity": row * 100,
                 }
                 for row in sample_rows
             ]

             # Simulated amount extraction.
             self.analysis_results["amounts"] = [
                 {
                     "type": "item_cost",
                     "amount": row * 10000,
                     "currency": "SAR",
                     "description": f"تكلفة البند {row}",
                 }
                 for row in sample_rows
             ]
         except Exception as exc:
             logger.error(f"خطأ في تحليل مستند Excel: {str(exc)}")
             raise
     def _analyze_txt(self, document_path, document_type):
         """Analyze a plain-text document (simulated).

         Currently this only logs that the analysis was requested; nothing is
         read from the file and ``self.analysis_results`` is left untouched.
         ``document_type`` is accepted but not used.

         Raises:
             Exception: Any error is logged and then re-raised unchanged.
         """
         try:
             start_message = f"تحليل مستند نصي: {document_path}"
             logger.info(start_message)
             # A real implementation would read the text file here and extract
             # items, entities, dates, amounts and risks like the other analyzers.
         except Exception as exc:
             logger.error(f"خطأ في تحليل مستند نصي: {str(exc)}")
             raise
     def get_analysis_status(self):
         """الحصول على حالة التحليل الحالي"""
         if not self.analysis_in_progress:
                 return {"status": "لا يوجد تحليل جارٍ"}
             else:
                 return {"status": self.analysis_results.get("status", "غير معروف")}
         return {
             "status": "جاري التحليل",
             "document_path": self.current_document,
             "start_time": self.analysis_results.get("analysis_start_time")
         }
     def get_analysis_results(self):
         """الحصول على نتائج التحليل"""
         return self.analysis_results
     def export_analysis_results(self, output_path=None):
         """Write the stored analysis results to a JSON file.

         Args:
             output_path: Destination file. When falsy, a timestamped
                 ``analysis_results_<YYYYmmdd_HHMMSS>.json`` file inside
                 ``self.documents_path`` is used instead.

         Returns:
             The path that was written, or ``None`` when there are no results
             to export or the write fails (the failure is logged, not raised).
         """
         # Nothing to export.
         if not self.analysis_results:
             logger.warning("لا توجد نتائج تحليل للتصدير")
             return None

         # Build a default, timestamped file name when no target was given.
         if not output_path:
             stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
             default_name = f"analysis_results_{stamp}.json"
             output_path = os.path.join(self.documents_path, default_name)

         try:
             with open(output_path, 'w', encoding='utf-8') as out_file:
                 # ensure_ascii=False keeps non-ASCII text readable in the file.
                 json.dump(self.analysis_results, out_file, ensure_ascii=False, indent=4)
             logger.info(f"تم تصدير نتائج التحليل إلى: {output_path}")
             return output_path
         except Exception as exc:
             logger.error(f"خطأ في تصدير نتائج التحليل: {str(exc)}")
             return None
     def import_analysis_results(self, input_path):
         """Load analysis results from a JSON file into the analyzer.

         Args:
             input_path: Path of the JSON file to read.

         Returns:
             bool: ``True`` when the file was parsed and stored in
             ``self.analysis_results``; ``False`` when the file does not exist
             or reading/parsing fails (the failure is logged, not raised).
         """
         source_exists = os.path.exists(input_path)
         if not source_exists:
             logger.error(f"ملف نتائج التحليل غير موجود: {input_path}")
             return False

         try:
             with open(input_path, 'r', encoding='utf-8') as in_file:
                 loaded = json.load(in_file)
                 self.analysis_results = loaded
             logger.info(f"تم استيراد نتائج التحليل من: {input_path}")
             return True
         except Exception as exc:
             logger.error(f"خطأ في استيراد نتائج التحليل: {str(exc)}")
             return False
         # البحث عن القيم النقدية
         currency_pattern = r'[\d,]+\.?\d*\s*(?:ريال|دولار|SAR|USD)'
         currencies = re.findall(currency_pattern, text)
         # البحث عن النسب المئوية
         percentage_pattern = r'\d+\.?\d*\s*%'
         percentages = re.findall(percentage_pattern, text)
         return {
             "currencies": currencies,
             "percentages": percentages
         avg_word_length = sum(len(word) for word in words) / len(words)
         sentences = text.split('.')
         avg_sentence_length = len(words) / len(sentences)
         # حساب درجة التعقيد (1-10)
         complexity = min((avg_word_length * 0.5 + avg_sentence_length * 0.2), 10)
         return round(complexity, 2)
             "الغرامات",
             "شروط الدفع"
         ]
         missing = []
         for section in required_sections:
             if section not in text:
                 missing.append(section)
         return missing
     def _find_related_documents(self, document_path):
         directory = os.path.dirname(document_path)
         base_name = os.path.basename(document_path)
         related = []
         for file in os.listdir(directory):
             if file != base_name and file.startswith(base_name.split('_')[0]):
                 related.append(file)
         return related
+    def process_image(self, image_path):
+        from PIL import Image
+        import io
+        # فتح الصورة
+        with Image.open(image_path) as img:
+            # تحويل الصورة إلى RGB إذا كانت RGBA
+            if img.mode == 'RGBA':
+                img = img.convert('RGB')
+            # تحديد الحجم الأقصى وضغط أكبر
+            max_size = (1200, 1200)
+            img.thumbnail(max_size, Image.Resampling.LANCZOS)
+            # ضغط الصورة بجودة أقل للحصول على حجم أصغر
+            buffer = io.BytesIO()
+            img.save(buffer, format='JPEG', quality=60, optimize=True)
+            # التحقق من الحجم والضغط أكثر إذا لزم الأمر
+            if len(buffer.getvalue()) > 5000000:  # 5MB
+                # محاولة ضغط إضافية
+                img.thumbnail((800, 800), Image.Resampling.LANCZOS)
+                buffer = io.BytesIO()
+                img.save(buffer, format='JPEG', quality=40, optimize=True)
+            # تحويل الصورة المضغوطة إلى base64
+            return base64.b64encode(buffer.getvalue()).decode('utf-8')
+    def convert_pdf_to_images(self, pdf_path):
+        """تحويل PDF إلى صور"""
+        try:
+            from pdf2image import convert_from_path
+            images = convert_from_path(pdf_path)
+            return images
+        except Exception as e:
+            logger.error(f"فشل في تحويل ملف PDF إلى صورة: {str(e)}")
+            raise