Moshe Ofer committed
Commit c6702f5 · 1 Parent(s): a5ab062

Initial commit for Hugging Face Space

Files changed (2)
  1. Dockerfile +9 -12
  2. app.py +51 -36
Dockerfile CHANGED
@@ -1,30 +1,27 @@
-# Use a lightweight Python base image
 FROM python:3.9-slim
 
-# Set the working directory in the container
 WORKDIR /app
 
-# Install system dependencies required for the application
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     git && \
     rm -rf /var/lib/apt/lists/*
 
-# Create a writable cache directory
 RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+ENV HF_HOME=/app/cache
 
-# Set the environment variable for Hugging Face Transformers cache
-ENV TRANSFORMERS_CACHE=/app/cache
-
-# Copy the application files into the container
 COPY . /app
 
-# Install Python dependencies
 RUN pip install --no-cache-dir --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Expose the application port
 EXPOSE 7860
 
-# Command to run the application using Gunicorn with Eventlet
-CMD ["gunicorn", "--worker-class", "eventlet", "-w", "1", "app:app", "-b", "0.0.0.0:7860"]
+CMD ["gunicorn", \
+     "--worker-class", "eventlet", \
+     "--workers", "1", \
+     "--timeout", "300", \
+     "--keep-alive", "120", \
+     "--log-level", "debug", \
+     "--bind", "0.0.0.0:7860", \
+     "app:app"]
app.py CHANGED
@@ -7,7 +7,15 @@ import eventlet
 
 eventlet.monkey_patch()
 app = Flask(__name__)
-socketio = SocketIO(app, ping_timeout=60)
+socketio = SocketIO(
+    app,
+    ping_timeout=60,
+    ping_interval=25,
+    cors_allowed_origins="*",
+    async_mode='eventlet',
+    logger=True,
+    engineio_logger=True
+)
 
 # Initialize model and tokenizer
 MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
@@ -57,40 +65,38 @@ def index():
 
 @socketio.on('generate')
 def handle_generation(data):
-    # Emit a generation start event
-    socketio.emit('generation_started')
-
-    prompt = data['prompt']
-    num_beams = data.get('num_beams', 5)
-    max_new_tokens = data.get('max_tokens', 512)
-    sleep_time = data.get('sleep_time', 0)  # Get sleep time from frontend
-
-    # Create messages format
-    messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": prompt}
-    ]
-
-    # Apply chat template
-    text = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # Prepare inputs
-    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-    # Initialize streamer with sleep time
-    streamer = WebSocketBeamStreamer(
-        tokenizer=tokenizer,
-        num_beams=num_beams,
-        sleep_time=sleep_time,
-        skip_prompt=True
-    )
-
     try:
-        # Generate with beam search
+        app.logger.info("Generation started with data: %s", data)
+        socketio.emit('generation_started')
+
+        prompt = data['prompt']
+        num_beams = data.get('num_beams', 5)
+        max_new_tokens = data.get('max_tokens', 512)
+        sleep_time = data.get('sleep_time', 0)
+
+        app.logger.info("Processing with parameters: beams=%d, max_tokens=%d",
+                        num_beams, max_new_tokens)
+
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
+        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+        streamer = WebSocketBeamStreamer(
+            tokenizer=tokenizer,
+            num_beams=num_beams,
+            sleep_time=sleep_time,
+            skip_prompt=True
+        )
+
         with torch.no_grad():
             model.generate(
                 **model_inputs,
@@ -102,12 +108,21 @@ def handle_generation(data):
                 early_stopping=True,
                 streamer=streamer
             )
+
+        app.logger.info("Generation completed successfully")
+
     except Exception as e:
+        app.logger.error("Generation error: %s", str(e), exc_info=True)
         socketio.emit('generation_error', {'error': str(e)})
     finally:
-        # Emit generation completed event
         socketio.emit('generation_completed')
 
 
 if __name__ == '__main__':
-    socketio.run(app, host='0.0.0.0', port=7860)
+    socketio.run(
+        app,
+        host='0.0.0.0',
+        port=7860,
+        debug=True,
+        use_reloader=False
+    )
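
Note: the Socket.IO event protocol is unchanged by this commit — the client emits 'generate' and listens for 'generation_started', 'generation_error', and 'generation_completed' — so the handler can be smoke-tested from a script. A hypothetical sketch using the python-socketio client (the URL and prompt are illustrative, and the per-token beam updates emitted by WebSocketBeamStreamer are not shown in this diff, so only the lifecycle events are handled):

import socketio  # pip install "python-socketio[client]"

sio = socketio.Client()

@sio.on('generation_started')
def on_started():
    print('server started generating')

@sio.on('generation_error')
def on_error(data):
    print('error:', data['error'])

@sio.on('generation_completed')
def on_completed():
    print('done')
    sio.disconnect()

# Address is illustrative; locally the container binds 0.0.0.0:7860.
sio.connect('http://localhost:7860')
sio.emit('generate', {
    'prompt': 'Explain beam search in one sentence.',
    'num_beams': 5,      # same defaults handle_generation falls back to
    'max_tokens': 512,
    'sleep_time': 0,
})
sio.wait()               # block until on_completed disconnects

The server-side ping_interval=25 / ping_timeout=60 pair keeps such a client connected through long beam-search stalls, and --timeout 300 raises the matching limit on the Gunicorn worker. The debug=True / use_reloader=False settings in the __main__ block affect only local runs: in the container the entrypoint is Gunicorn, so that block never executes.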