Ali2206 committed
Commit 6763f7b · verified · 1 Parent(s): 588868a

Update ui/ui_core.py

Files changed (1): ui/ui_core.py (+31 -29)
ui/ui_core.py CHANGED
@@ -5,10 +5,8 @@ import pdfplumber
 import json
 import gradio as gr
 from typing import List
-
-from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
-from PIL import Image
-import torch
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import hashlib
 
 # ✅ Fix: Add src to Python path
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")))
@@ -36,24 +34,20 @@ def clean_final_response(text: str) -> str:
     )
     return "".join(panels)
 
-def use_layoutlmv3_on_image(image_path):
-    processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-large")
-    model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-large")
-
-    image = Image.open(image_path).convert("RGB")
-    encoding = processor(images=image, return_tensors="pt")
-    with torch.no_grad():
-        outputs = model(**encoding)
-
-    logits = outputs.logits
-    predicted_class = logits.argmax(-1)
-    tokens = processor.tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
-
-    text = " ".join([tokens[i] for i in range(len(tokens)) if predicted_class[0][i] != -100])
-    return json.dumps({"filename": os.path.basename(image_path), "content": text})
+def file_hash(path):
+    with open(path, "rb") as f:
+        return hashlib.md5(f.read()).hexdigest()
 
 def convert_file_to_json(file_path: str, file_type: str) -> str:
     try:
+        cache_dir = os.path.join("cache")
+        os.makedirs(cache_dir, exist_ok=True)
+        h = file_hash(file_path)
+        cache_path = os.path.join(cache_dir, f"{h}.json")
+
+        if os.path.exists(cache_path):
+            return open(cache_path, "r", encoding="utf-8").read()
+
         if file_type == "csv":
             df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str, skip_blank_lines=False, on_bad_lines="skip")
         elif file_type in ["xls", "xlsx"]:
@@ -62,7 +56,11 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
             except:
                 df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
         elif file_type == "pdf":
-            return use_layoutlmv3_on_image(file_path)
+            with pdfplumber.open(file_path) as pdf:
+                text = "\n".join([page.extract_text() or "" for page in pdf.pages])
+            result = json.dumps({"filename": os.path.basename(file_path), "content": text.strip()})
+            open(cache_path, "w", encoding="utf-8").write(result)
+            return result
         else:
             return json.dumps({"error": f"Unsupported file type: {file_type}"})
 
@@ -71,7 +69,9 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
 
         df = df.fillna("")
         content = df.astype(str).values.tolist()
-        return json.dumps({"filename": os.path.basename(file_path), "rows": content})
+        result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+        open(cache_path, "w", encoding="utf-8").write(result)
+        return result
     except Exception as e:
         return json.dumps({"error": f"Error reading {os.path.basename(file_path)}: {str(e)}"})
 
@@ -133,13 +133,11 @@ def create_ui(agent: TxAgent):
 
             chunks = chunk_text(extracted_text.strip())
 
-            full_response = ""
-            for i, chunk in enumerate(chunks):
+            def process_chunk(i, chunk):
                 chunked_prompt = (
                     f"{context}\n\n--- Uploaded File Content (Chunk {i+1}/{len(chunks)}) ---\n\n{chunk}\n\n"
                     f"--- End of Chunk ---\n\nNow begin your analysis:"
                 )
-
                 generator = agent.run_gradio_chat(
                     message=chunked_prompt,
                     history=[],
@@ -151,18 +149,22 @@
                     uploaded_files=uploaded_files,
                     max_round=30
                 )
-
-                chunk_response = ""
+                result = ""
                 for update in generator:
                     if isinstance(update, str):
-                        chunk_response += update
+                        result += update
                     elif isinstance(update, list):
                         for msg in update:
                             if hasattr(msg, 'content'):
-                                chunk_response += msg.content
+                                result += msg.content
+                return result
 
-                full_response += chunk_response + "\n\n"
+            # Parallel Execution for Speed
+            with ThreadPoolExecutor(max_workers=min(8, len(chunks))) as executor:
+                futures = [executor.submit(process_chunk, i, chunk) for i, chunk in enumerate(chunks)]
+                results = [f.result() for f in as_completed(futures)]
 
+            full_response = "\n\n".join(results)
             full_response = clean_final_response(full_response.strip())
             history[-1] = (message, full_response)
             yield history
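
For reference, here is a minimal sketch of the caching scheme this commit introduces: hash the file's raw bytes with MD5 and key the converted JSON on the digest, so repeated uploads of identical files skip re-parsing. The cached_convert helper and the convert callable are illustrative stand-ins, not part of the commit:

import hashlib
import os

CACHE_DIR = "cache"  # the commit uses a relative "cache" directory


def file_hash(path: str) -> str:
    # Hash the raw bytes so identical uploads map to the same cache entry
    # regardless of filename. MD5 is fine here: it is a cache key, not a
    # security boundary.
    with open(path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()


def cached_convert(path: str, convert) -> str:
    # convert: any callable producing the JSON string on a cache miss
    # (a hypothetical stand-in for the body of convert_file_to_json).
    os.makedirs(CACHE_DIR, exist_ok=True)
    cache_path = os.path.join(CACHE_DIR, f"{file_hash(path)}.json")
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            return f.read()
    result = convert(path)
    with open(cache_path, "w", encoding="utf-8") as f:
        f.write(result)
    return result

One caveat of a content-only key: entries never invalidate when the conversion logic changes, so folding a version string into the key (or clearing cache/ on deploy) is worth considering.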
 
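The PDF branch swaps the removed LayoutLMv3 pass for plain pdfplumber text extraction. A self-contained version of that branch, with an illustrative function name:

import json
import os

import pdfplumber


def pdf_to_json(file_path: str) -> str:
    # extract_text() returns None for pages with no text layer, hence
    # the `or ""` fallback.
    with pdfplumber.open(file_path) as pdf:
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
    return json.dumps({"filename": os.path.basename(file_path), "content": text.strip()})

Note that scanned, image-only PDFs now come back empty; an OCR pass is what that case would need. The removed function would not have covered it cleanly either, since LayoutLMv3ForTokenClassification classifies already-extracted tokens rather than performing OCR itself.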
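One thing worth flagging in the parallel path: as_completed() yields futures in completion order, not submission order, so the joined full_response can interleave chunk analyses out of sequence. Iterating the futures list directly preserves order at no extra cost. A runnable sketch with stand-in chunks, assuming the same process_chunk shape as in the commit:

from concurrent.futures import ThreadPoolExecutor


def process_chunk(i: int, chunk: str) -> str:
    # Stand-in for the closure defined in the commit.
    return f"analysis of chunk {i + 1}: {chunk}"


chunks = ["first", "second", "third"]

with ThreadPoolExecutor(max_workers=min(8, len(chunks))) as executor:
    futures = [executor.submit(process_chunk, i, c) for i, c in enumerate(chunks)]
    # Iterating `futures` (not as_completed) keeps submission order;
    # each .result() blocks until that particular chunk finishes.
    results = [f.result() for f in futures]

full_response = "\n\n".join(results)  # chunks join in original order

This also assumes agent.run_gradio_chat is safe to call from several threads at once; if the agent holds shared state, max_workers=1 or a lock around the call gives correctness at the cost of the speedup.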