File size: 3,836 Bytes
a1b31ed
 
 
 
 
 
 
 
 
c6702f5
 
 
 
 
 
 
 
 
a1b31ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8e37ed
a1b31ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8e37ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6702f5
b8e37ed
c6702f5
b8e37ed
 
a1b31ed
b8e37ed
 
a1b31ed
c6702f5
b8e37ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1b31ed
 
c6702f5
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from flask import Flask, render_template
from flask_socketio import SocketIO
from transformers import MultiBeamTextStreamer, AutoTokenizer, AutoModelForCausalLM
import torch
import time
import eventlet

# Patch the standard library for cooperative (green-thread) concurrency.
# Must run before anything else captures references to the unpatched
# socket/threading primitives.
eventlet.monkey_patch()
app = Flask(__name__)
# Socket.IO server; async_mode matches the eventlet monkey-patch above.
socketio = SocketIO(
    app,
    ping_timeout=60,  # seconds without a pong before a client is dropped
    ping_interval=25,  # seconds between keep-alive pings
    cors_allowed_origins="*",  # NOTE(review): wide-open CORS — demo-only setting
    async_mode='eventlet',
    logger=True,  # verbose Socket.IO / Engine.IO logging for debugging
    engineio_logger=True
)

# Initialize model and tokenizer
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",  # take the dtype from the checkpoint config
    device_map="auto"  # place weights on GPU when one is available
)


class WebSocketBeamStreamer(MultiBeamTextStreamer):
    """Streams beam-search hypotheses to connected clients over Socket.IO.

    An optional per-update delay (in milliseconds) throttles the stream so
    a UI can animate the evolving beams.
    """

    def __init__(self, tokenizer, num_beams, sleep_time=0, skip_prompt=True):
        super().__init__(
            tokenizer,
            num_beams=num_beams,
            skip_prompt=skip_prompt,
            on_beam_update=self.on_beam_update,
            on_beam_finished=self.on_beam_finished
        )
        # Latest text observed for each beam index.
        self.beam_texts = dict.fromkeys(range(num_beams), "")
        # Delay between updates, in milliseconds (0 disables throttling).
        self.sleep_time = sleep_time

    def on_beam_update(self, beam_idx: int, new_text: str):
        """Record a beam's current text and push it to all clients."""
        self.beam_texts[beam_idx] = new_text
        if self.sleep_time > 0:
            # eventlet.sleep both delays and yields to the event loop.
            eventlet.sleep(self.sleep_time / 1000)
        payload = {'beam_idx': beam_idx, 'text': new_text}
        socketio.emit('beam_update', payload)

    def on_beam_finished(self, final_text: str):
        """Tell clients that one beam has produced its final text."""
        socketio.emit('beam_finished', {'text': final_text})


@app.route('/')
def index():
    """Serve the single-page beam-search visualizer UI."""
    page = render_template('index.html')
    return page


@socketio.on('generate')
def handle_generation(data):
    """Handle a client 'generate' event.

    Spawns a green thread that runs beam-search generation and streams
    per-beam updates back over the socket, so this handler returns
    immediately and the server stays responsive.

    Expected keys in ``data``:
        prompt (str, required), num_beams (int, default 5),
        max_tokens (int, default 512), sleep_time (ms, default 0).
    """
    def generate_async():
        try:
            app.logger.info("Generation started with data: %s", data)
            # FIX: broadcast emits never invoke callbacks in Flask-SocketIO,
            # so the old `callback=lambda: eventlet.sleep(0)` was dead code.
            # Yield explicitly instead so the emit is flushed to clients
            # before the blocking generation work begins.
            socketio.emit('generation_started')
            eventlet.sleep(0)

            # FIX: a missing prompt previously surfaced as an opaque
            # KeyError("'prompt'"); validate it with a clear message.
            prompt = data.get('prompt')
            if not prompt:
                socketio.emit('generation_error',
                              {'error': "Missing required field 'prompt'"})
                return
            num_beams = data.get('num_beams', 5)
            max_new_tokens = data.get('max_tokens', 512)
            sleep_time = data.get('sleep_time', 0)

            # Wrap the prompt in the model's chat template.
            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

            streamer = WebSocketBeamStreamer(
                tokenizer=tokenizer,
                num_beams=num_beams,
                sleep_time=sleep_time,
                skip_prompt=True
            )

            # Results are delivered through the streamer callbacks, so the
            # return value of generate() is intentionally ignored.
            with torch.no_grad():
                model.generate(
                    **model_inputs,
                    num_beams=num_beams,
                    num_return_sequences=num_beams,
                    max_new_tokens=max_new_tokens,
                    output_scores=True,
                    return_dict_in_generate=True,
                    early_stopping=True,
                    streamer=streamer
                )

        except Exception as e:
            app.logger.error("Generation error: %s", str(e), exc_info=True)
            socketio.emit('generation_error', {'error': str(e)})
        finally:
            # Always tell the client the run is over, even after an error.
            socketio.emit('generation_completed')

    eventlet.spawn(generate_async)

if __name__ == '__main__':
    # Start the eventlet-backed dev server.
    server_options = {
        'host': '0.0.0.0',  # listen on all interfaces
        'port': 7860,
        'debug': True,
        # A reloader would fork and load the model a second time.
        'use_reloader': False,
    }
    socketio.run(app, **server_options)