Ali2206 committed
Commit d313543 · verified · 1 Parent(s): 455d1f0

Update app.py

Files changed (1): app.py (+28 -48)
app.py CHANGED

@@ -10,6 +10,7 @@ import re
 import psutil
 import subprocess
 from collections import defaultdict
+from vllm import LLM, SamplingParams  # MODIFIED: Direct vLLM for batching
 
 # Persistent directory
 persistent_dir = os.getenv("HF_HOME", "/data/hf_cache")
@@ -143,23 +144,18 @@ def consolidate_findings(responses: List[str]) -> str:
 def init_agent():
     print("🔍 Initializing model...")
     log_system_usage("Before Load")
-    agent = TxAgent(
-        model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
-        rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
-        force_finish=True,
-        enable_checker=False,
-        enable_rag=False,
-        enable_finish=False,  # MODIFIED: Disable Finish tool
-        tool_files_dict=None,
-        step_rag_num=0,
-        seed=100,
+    model = LLM(
+        model="mims-harvard/TxAgent-T1-Llama-3.1-8B",
+        max_model_len=4096,  # MODIFIED: Reduce KV cache
+        enforce_eager=True,
+        enable_chunked_prefill=True,
+        max_num_batched_tokens=8192,
     )
-    agent.init_model()
     log_system_usage("After Load")
-    print("✅ Agent Ready")
-    return agent
+    print("✅ Model Ready")
+    return model
 
-def create_ui(agent):
+def create_ui(model):
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
         chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
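For reference, a minimal, self-contained sketch of what the new init_agent() amounts to once TxAgent is bypassed. The model name and engine flags are taken from the diff; the __main__ smoke test at the bottom is only an illustrative assumption, not part of app.py.

```python
# Sketch of the direct-vLLM initialization used in the diff above.
# The __main__ smoke test is illustrative only; it is not part of app.py.
from vllm import LLM, SamplingParams

def init_agent() -> LLM:
    return LLM(
        model="mims-harvard/TxAgent-T1-Llama-3.1-8B",
        max_model_len=4096,            # smaller context window -> smaller KV cache
        enforce_eager=True,            # skip CUDA graph capture (saves VRAM and startup time)
        enable_chunked_prefill=True,   # split long prompts into prefill chunks
        max_num_batched_tokens=8192,   # cap on tokens scheduled per engine step
    )

if __name__ == "__main__":
    llm = init_agent()
    out = llm.generate(["Say OK."], SamplingParams(temperature=0.1, max_tokens=8, seed=100))
    print(out[0].outputs[0].text)
```

The combination of `enforce_eager=True` and the reduced `max_model_len` trades some peak throughput for a smaller memory footprint and faster startup, which fits a single-GPU Space.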
@@ -185,7 +181,7 @@ def create_ui(agent):
         chunk_size = 800
         chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
         chunk_responses = []
-        batch_size = 8  # MODIFIED: Increase for parallelism
+        batch_size = 8
         total_chunks = len(chunks)
 
         prompt_template = """
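As a quick illustration of the chunk/batch arithmetic used above (chunk_size and batch_size come from the diff; the 2,000-character input is made up):

```python
# Fixed-size character chunking plus batch windows, mirroring the app's loop structure.
extracted = "x" * 2000                      # hypothetical extracted report text
chunk_size = 800
batch_size = 8
chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
print(len(chunks))                          # -> 3 (800 + 800 + 400 characters)

total_chunks = len(chunks)
for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]
    processed = min(i + len(batch), total_chunks)
    print(f"batch {i // batch_size + 1}: {len(batch)} chunk(s), {processed}/{total_chunks} done")
```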
@@ -199,42 +195,26 @@ Output only oversights under these headings, one point each. No tools, reasoning
 Records:
 {chunk}
 """
+        sampling_params = SamplingParams(
+            temperature=0.1,
+            max_tokens=32,  # MODIFIED: Reduce for speed
+            seed=100,
+        )
 
         try:
             for i in range(0, len(chunks), batch_size):
                 batch = chunks[i:i + batch_size]
+                prompts = [prompt_template.format(chunk=chunk) for chunk in batch]
+                log_system_usage(f"Batch {i//batch_size + 1}")
+                outputs = model.generate(prompts, sampling_params)  # MODIFIED: Batch inference
                 batch_responses = []
-                log_system_usage(f"Batch {i//batch_size + 1}")  # MODIFIED: Log VRAM
-                for j, chunk in enumerate(batch):
-                    prompt = prompt_template.format(chunk=chunk)
-                    chunk_response = ""
-                    for output in agent.run_gradio_chat(
-                        message=prompt,
-                        history=[],
-                        temperature=0.1,
-                        max_new_tokens=64,  # MODIFIED: Reduce for speed
-                        max_token=4096,
-                        call_agent=False,
-                        conversation=[],
-                    ):
-                        if output is None:
-                            continue
-                        if isinstance(output, list):
-                            for m in output:
-                                if hasattr(m, 'content') and m.content:
-                                    cleaned = clean_response(m.content)
-                                    if cleaned:
-                                        chunk_response += cleaned + "\n"
-                        elif isinstance(output, str) and output.strip():
-                            cleaned = clean_response(output)
-                            if cleaned:
-                                chunk_response += cleaned + "\n"
-                    if chunk_response:
-                        batch_responses.append(chunk_response)
-                    processed = min(i + j + 1, total_chunks)
-                    history[-1]["content"] = f"🔄 Analyzing... ({processed}/{total_chunks} chunks)"
-                    yield history, None
-                chunk_responses.extend(batch_responses)
+                with ThreadPoolExecutor(max_workers=8) as executor:  # MODIFIED: Parallel cleanup
+                    futures = [executor.submit(clean_response, output.outputs[0].text) for output in outputs]
+                    batch_responses.extend(f.result() for f in as_completed(futures))
+                chunk_responses.extend([r for r in batch_responses if r])
+                processed = min(i + len(batch), total_chunks)
+                history[-1]["content"] = f"🔄 Analyzing... ({processed}/{total_chunks} chunks)"
+                yield history, None
 
             final_response = consolidate_findings(chunk_responses)
             history[-1]["content"] = final_response
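A self-contained sketch of the new per-batch flow (one batched vLLM generate call followed by threaded post-processing). It presumes `ThreadPoolExecutor` and `as_completed` are already imported from `concurrent.futures` elsewhere in app.py, since this diff uses both without adding the import; `clean_response` is stubbed here purely for illustration.

```python
# Sketch of the batched-generate + parallel-cleanup pattern introduced above.
from concurrent.futures import ThreadPoolExecutor, as_completed
from vllm import LLM, SamplingParams

def clean_response(text: str) -> str:
    # Stand-in for app.py's clean_response(); here it just trims whitespace.
    return text.strip()

def run_batch(model: LLM, prompts: list[str]) -> list[str]:
    sampling_params = SamplingParams(temperature=0.1, max_tokens=32, seed=100)
    # One generate() call per batch: vLLM schedules all prompts together.
    outputs = model.generate(prompts, sampling_params)
    # Post-process each completion in a thread pool.
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(clean_response, out.outputs[0].text) for out in outputs]
        results = [f.result() for f in as_completed(futures)]
    return [r for r in results if r]
```

One side effect worth noting: `as_completed()` yields results in completion order, so per-chunk ordering is not preserved; if consolidate_findings() depends on order, iterating `futures` in submission order and calling `f.result()` on each would keep it.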
@@ -257,8 +237,8 @@ Records:
 
 if __name__ == "__main__":
     print("🚀 Launching app...")
-    agent = init_agent()
-    demo = create_ui(agent)
+    model = init_agent()
+    demo = create_ui(model)
     demo.queue(api_open=False).launch(
         server_name="0.0.0.0",
         server_port=7860,
 