Moshe Ofer committed
Commit c6702f5 · 1 Parent(s): a5ab062

Initial commit for Hugging Face Space

Files changed (2)
  1. Dockerfile +9 -12
  2. app.py +51 -36
Dockerfile CHANGED
@@ -1,30 +1,27 @@
-# Use a lightweight Python base image
 FROM python:3.9-slim
 
-# Set the working directory in the container
 WORKDIR /app
 
-# Install system dependencies required for the application
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     git && \
     rm -rf /var/lib/apt/lists/*
 
-# Create a writable cache directory
 RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+ENV HF_HOME=/app/cache
 
-# Set the environment variable for Hugging Face Transformers cache
-ENV TRANSFORMERS_CACHE=/app/cache
-
-# Copy the application files into the container
 COPY . /app
 
-# Install Python dependencies
 RUN pip install --no-cache-dir --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Expose the application port
 EXPOSE 7860
 
-# Command to run the application using Gunicorn with Eventlet
-CMD ["gunicorn", "--worker-class", "eventlet", "-w", "1", "app:app", "-b", "0.0.0.0:7860"]
+CMD ["gunicorn", \
+     "--worker-class", "eventlet", \
+     "--workers", "1", \
+     "--timeout", "300", \
+     "--keep-alive", "120", \
+     "--log-level", "debug", \
+     "--bind", "0.0.0.0:7860", \
+     "app:app"]
app.py CHANGED
@@ -7,7 +7,15 @@ import eventlet
 
 eventlet.monkey_patch()
 app = Flask(__name__)
-socketio = SocketIO(app, ping_timeout=60)
+socketio = SocketIO(
+    app,
+    ping_timeout=60,
+    ping_interval=25,
+    cors_allowed_origins="*",
+    async_mode='eventlet',
+    logger=True,
+    engineio_logger=True
+)
 
 # Initialize model and tokenizer
 MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
@@ -57,40 +65,38 @@ def index():
 
 @socketio.on('generate')
 def handle_generation(data):
-    # Emit a generation start event
-    socketio.emit('generation_started')
-
-    prompt = data['prompt']
-    num_beams = data.get('num_beams', 5)
-    max_new_tokens = data.get('max_tokens', 512)
-    sleep_time = data.get('sleep_time', 0)  # Get sleep time from frontend
-
-    # Create messages format
-    messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": prompt}
-    ]
-
-    # Apply chat template
-    text = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # Prepare inputs
-    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-    # Initialize streamer with sleep time
-    streamer = WebSocketBeamStreamer(
-        tokenizer=tokenizer,
-        num_beams=num_beams,
-        sleep_time=sleep_time,
-        skip_prompt=True
-    )
-
     try:
-        # Generate with beam search
+        app.logger.info("Generation started with data: %s", data)
+        socketio.emit('generation_started')
+
+        prompt = data['prompt']
+        num_beams = data.get('num_beams', 5)
+        max_new_tokens = data.get('max_tokens', 512)
+        sleep_time = data.get('sleep_time', 0)
+
+        app.logger.info("Processing with parameters: beams=%d, max_tokens=%d",
+                        num_beams, max_new_tokens)
+
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
+        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+        streamer = WebSocketBeamStreamer(
+            tokenizer=tokenizer,
+            num_beams=num_beams,
+            sleep_time=sleep_time,
+            skip_prompt=True
+        )
+
         with torch.no_grad():
             model.generate(
                 **model_inputs,
@@ -102,12 +108,21 @@ def handle_generation(data):
                 early_stopping=True,
                 streamer=streamer
             )
+
+        app.logger.info("Generation completed successfully")
+
     except Exception as e:
+        app.logger.error("Generation error: %s", str(e), exc_info=True)
         socketio.emit('generation_error', {'error': str(e)})
     finally:
-        # Emit generation completed event
         socketio.emit('generation_completed')
 
 
 if __name__ == '__main__':
-    socketio.run(app, host='0.0.0.0', port=7860)
+    socketio.run(
+        app,
+        host='0.0.0.0',
+        port=7860,
+        debug=True,
+        use_reloader=False
+    )
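
Note: the Socket.IO event protocol is unchanged by this commit — the client emits 'generate' and listens for 'generation_started', 'generation_error', and 'generation_completed' — so the handler can be smoke-tested from a script. A hypothetical sketch using the python-socketio client (the URL and prompt are illustrative, and the per-token beam updates emitted by WebSocketBeamStreamer are not shown in this diff, so only the lifecycle events are handled):

import socketio  # pip install "python-socketio[client]"

sio = socketio.Client()

@sio.on('generation_started')
def on_started():
    print('server started generating')

@sio.on('generation_error')
def on_error(data):
    print('error:', data['error'])

@sio.on('generation_completed')
def on_completed():
    print('done')
    sio.disconnect()

# Address is illustrative; locally the container binds 0.0.0.0:7860.
sio.connect('http://localhost:7860')
sio.emit('generate', {
    'prompt': 'Explain beam search in one sentence.',
    'num_beams': 5,      # same defaults handle_generation falls back to
    'max_tokens': 512,
    'sleep_time': 0,
})
sio.wait()               # block until on_completed disconnects

The server-side ping_interval=25 / ping_timeout=60 pair keeps such a client connected through long beam-search stalls, and --timeout 300 raises the matching limit on the Gunicorn worker. The debug=True / use_reloader=False settings in the __main__ block affect only local runs: in the container the entrypoint is Gunicorn, so that block never executes.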