DeepSeekR1-LIVE

Paused

App Files Files Community

sagar007 committed on Jan 26

Commit

8a9a6c3

verified ·

1 Parent(s): f4f3cd0

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -60

app.py CHANGED Viewed

@@ -1,32 +1,49 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import spaces
 from duckduckgo_search import DDGS
 import time
 import torch
 from datetime import datetime
-# Load the tokenizer and model once, at module import time
 model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
 tokenizer.pad_token = tokenizer.eos_token  # pad with the EOS token
-# Load the model weights on the CPU
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    device_map="cpu",  # keep the whole model on the CPU
     low_cpu_mem_usage=True,
-    torch_dtype=torch.float32  # float32 weights for CPU inference
 )
-def get_web_results(query, max_results=5):  # Increased to 5 for better context
     """Get web search results using DuckDuckGo"""
     try:
         with DDGS() as ddgs:
             results = list(ddgs.text(query, max_results=max_results))
             return [{
                 "title": result.get("title", ""),
-                "snippet": result["body"],
                 "url": result["href"],
                 "date": result.get("published", "")
             } for result in results]
@@ -34,19 +51,10 @@ def get_web_results(query, max_results=5):  # Increased to 5 for better context
         return []
 def format_prompt(query, context):
-    """Format the prompt with web context"""
-    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # naive datetime.now(), formatted to the second
-    context_lines = '\n'.join([f'- [{res["title"]}]: {res["snippet"]}' for res in context])  # one "- [title]: snippet" bullet per result; NOTE(review): bullets are unnumbered although the prompt asks for [1], [2] citations
-    return f"""You are an intelligent search assistant. Answer the user's query using the provided web context.
-Current Time: {current_time}
-Query: {query}
-Web Context:
-{context_lines}
-Provide a detailed answer in markdown format. Include relevant information from sources and cite them using [1], [2], etc.
-Answer:"""
 def format_sources(web_results):
     """Format sources with more details"""
@@ -71,69 +79,81 @@ def format_sources(web_results):
     return sources_html
 def generate_answer(prompt):
-    """Generate answer using the DeepSeek model"""
-    inputs = tokenizer(
-        prompt,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=256,  # truncate the prompt to at most 256 tokens to keep CPU inference fast
-        return_attention_mask=True
-    )  # no .to(model.device) call: the model is loaded with device_map="cpu"
-    outputs = model.generate(
-        inputs.input_ids,
-        attention_mask=inputs.attention_mask,
-        max_new_tokens=128,  # cap the reply at 128 new tokens to bound CPU generation time
-        temperature=0.7,
-        top_p=0.95,
-        pad_token_id=tokenizer.eos_token_id,
-        do_sample=True,
-        early_stopping=True,  # NOTE(review): beam-search option; likely unused with num_beams=1 - confirm
-        num_beams=1  # single beam, i.e. no beam search
-    )
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)  # process_query keeps only the text after the last "Answer:"
 def process_query(query, history):
-    """Process user query with streaming effect"""
     try:
         if history is None:
             history = []
         # Run the web search first; its results feed both the sources panel and the prompt
         web_results = get_web_results(query)
         sources_html = format_sources(web_results)
-        current_history = history + [[query, "*Searching...*"]]  # new list; the caller's history is not mutated
         yield {
-            answer_output: gr.Markdown("*Searching the web...*"),
             sources_output: gr.HTML(sources_html),
-            search_btn: gr.Button("Searching...", interactive=False),
-            chat_history_display: current_history
         }
-        # Build the prompt from the search results and run the model
         prompt = format_prompt(query, web_results)
         answer = generate_answer(prompt)
-        final_answer = answer.split("Answer:")[-1].strip()  # keep only the text after the last "Answer:" marker
-        updated_history = history + [[query, final_answer]]
         yield {
-            answer_output: gr.Markdown(final_answer),
             sources_output: gr.HTML(sources_html),
             search_btn: gr.Button("Search", interactive=True),
-            chat_history_display: updated_history
         }
-    except Exception as e:
-        error_message = str(e)
-        if "GPU quota" in error_message:
-            error_message = "⚠️ GPU quota exceeded. Please try again later when the daily quota resets."
         yield {
-            answer_output: gr.Markdown(f"Error: {error_message}"),
-            sources_output: gr.HTML(sources_html),  # NOTE(review): sources_html is unbound here if get_web_results/format_sources raised
             search_btn: gr.Button("Search", interactive=True),
-            chat_history_display: history + [[query, f"*Error: {error_message}*"]]
         }
 # Update the CSS for better contrast and readability

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import spaces
 from duckduckgo_search import DDGS
 import time
 import torch
 from datetime import datetime
+import gc  # For manual garbage collection
+# Load the config, tokenizer and model once, at module import time
 model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+# Load the config first so use_cache can be set before the model is built
+config = AutoConfig.from_pretrained(model_name)
+config.use_cache = True  # Enable KV-caching for faster inference
+# Tokenizer: 256-token limit, padding and truncation both applied on the left
+tokenizer = AutoTokenizer.from_pretrained(
+    model_name,
+    model_max_length=256,  # same limit generate_answer passes as max_length
+    padding_side="left",
+    truncation_side="left",  # NOTE(review): format_prompt puts the query at the start of the prompt, so left truncation would drop the query first on over-length prompts - confirm intended
+)
 tokenizer.pad_token = tokenizer.eos_token  # pad with the EOS token
+# Load the model on the CPU in float32, using the config prepared above
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
+    config=config,
+    device_map="cpu",
     low_cpu_mem_usage=True,
+    torch_dtype=torch.float32,
 )
+# Inference-only runtime settings
+model.eval()  # Set to evaluation mode
+torch.set_num_threads(4)  # cap the number of CPU threads PyTorch uses at 4
+def get_web_results(query, max_results=3):  # Reduced max results
     """Get web search results using DuckDuckGo"""
     try:
         with DDGS() as ddgs:
             results = list(ddgs.text(query, max_results=max_results))
             return [{
                 "title": result.get("title", ""),
+                "snippet": result["body"][:200],  # Limit snippet length
                 "url": result["href"],
                 "date": result.get("published", "")
             } for result in results]
         return []
 def format_prompt(query, context):
+    """Format the prompt with web context - optimized version"""
+    context_lines = '\n'.join([f'[{i+1}] {res["snippet"]}'
+                              for i, res in enumerate(context)])  # one "[n] snippet" line per result, numbered from 1 in result order
+    return f"""Answer this query using the context: {query}\n\nContext:\n{context_lines}\n\nAnswer:"""  # ends with "Answer:", the marker generate_answer splits on
 def format_sources(web_results):
     """Format sources with more details"""
     return sources_html
 def generate_answer(prompt):
+    """Generate answer using the DeepSeek model - optimized version"""
+    try:
+        # Release cached CUDA memory when a GPU is visible, then force a garbage-collection pass
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=256,  # prompts longer than 256 tokens are truncated
+            return_attention_mask=True
+        )
+        with torch.no_grad():  # inference only: no gradient tracking
+            outputs = model.generate(
+                inputs.input_ids,
+                attention_mask=inputs.attention_mask,
+                max_new_tokens=100,  # cap the reply at 100 new tokens for speed
+                temperature=0.7,
+                top_p=0.95,
+                pad_token_id=tokenizer.eos_token_id,
+                do_sample=True,
+                num_beams=1,  # single beam, i.e. no beam search
+                early_stopping=True,  # NOTE(review): beam-search option; likely unused with num_beams=1 - confirm
+                no_repeat_ngram_size=3,
+                length_penalty=1.0  # NOTE(review): beam-search option; likely unused with num_beams=1 - confirm
+            )
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return response.split('Answer:')[-1].strip()  # keep only the text after the last "Answer:" marker
+    except Exception as e:
+        return f"Error generating response: {str(e)}"  # failures are returned as the answer text, not raised
 def process_query(query, history):
+    """Process user query with optimized streaming effect"""
     try:
         if history is None:
             history = []
         # Run the web search first; its results feed both the sources panel and the prompt
         web_results = get_web_results(query)
         sources_html = format_sources(web_results)
+        # First update: show the sources and a placeholder while the answer is generated
         yield {
+            answer_output: gr.Markdown("*Searching and generating response...*"),  # NOTE(review): the search has already finished here; only generation is pending
             sources_output: gr.HTML(sources_html),
+            search_btn: gr.Button("Please wait...", interactive=False),
+            chat_history_display: history + [[query, "*Processing...*"]]
         }
+        # Build the prompt and generate the answer (no timeout is applied here)
         prompt = format_prompt(query, web_results)
         answer = generate_answer(prompt)
+        # Second update: replace the placeholder with the answer and re-enable the button
+        final_history = history + [[query, answer]]
         yield {
+            answer_output: gr.Markdown(answer),
             sources_output: gr.HTML(sources_html),
             search_btn: gr.Button("Search", interactive=True),
+            chat_history_display: final_history
         }
+    except Exception as e:
+        error_msg = f"Error: {str(e)}"  # shown in both the answer pane and the chat history
         yield {
+            answer_output: gr.Markdown(error_msg),
+            sources_output: gr.HTML("<div>Error fetching sources</div>"),  # fixed placeholder; does not read sources_html, which is unset if the search step raised
             search_btn: gr.Button("Search", interactive=True),
+            chat_history_display: history + [[query, error_msg]]
         }
 # Update the CSS for better contrast and readability