File size: 5,195 Bytes
a99c8d6
 
 
5ff3da0
a482122
5ff3da0
a482122
3331037
a482122
 
5ff3da0
a482122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ff3da0
a482122
 
 
 
5ff3da0
a482122
 
 
 
 
5ff3da0
a482122
 
 
 
 
 
 
5ff3da0
d319fa6
 
a482122
d319fa6
 
 
 
 
 
5ff3da0
d319fa6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a482122
 
 
d319fa6
 
a482122
d319fa6
 
 
 
 
 
 
 
 
 
 
 
 
 
5ff3da0
 
e400a0b
d319fa6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import base64
import mimetypes
import os

import gradio as gr
from openai import OpenAI

# OpenAI-compatible client pointed at the Tencent Hunyuan endpoint.
# The API key is read from the HUNYUAN_API_KEY environment variable.
client = OpenAI(
    base_url="https://api.hunyuan.cloud.tencent.com/v1",
    api_key=os.environ.get("HUNYUAN_API_KEY"),
)

def generate_caption(image_path, question):
    """Stream a Hunyuan-Vision answer to ``question`` about an image.

    The image file is read, Base64-encoded and sent inline as a data URL
    together with the question in a single user message. The completion is
    requested in streaming mode; after every non-empty token the text
    accumulated so far is yielded.

    Because this is a generator, validation and I/O happen on the first
    iteration, not when the function is called.

    Args:
        image_path: Filesystem path of the image to send. ``None`` or an
            empty string (no image supplied) is rejected.
        question: Text prompt sent alongside the image.

    Yields:
        str: The response text received so far, growing with each token.

    Raises:
        ValueError: If ``image_path`` is ``None`` or empty.
        OSError: If the image file cannot be opened or read.
    """
    if not image_path:
        # User-facing message ("Please upload an image first"); without this
        # guard open() fails with an opaque TypeError/FileNotFoundError.
        raise ValueError("请先上传图片")

    # Encode the image as Base64
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    # Label the data URL with the media type guessed from the file name
    # (e.g. image/png); fall back to image/jpeg when it cannot be determined.
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Build the message payload: one user turn holding the text and the image
    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": question},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"
                }
            }
        ]
    }]

    # Call the Hunyuan vision model; extra_body carries Hunyuan-specific
    # request options that are passed through as given.
    response = client.chat.completions.create(
        model="hunyuan-vision",
        messages=messages,
        stream=True,
        extra_body={
            "stream_moderation": True,
            "enable_enhancement": False
        }
    )

    # Consume the stream, yielding the cumulative text after each token
    full_response = ""
    for chunk in response:
        # Some OpenAI-compatible streams emit chunks with an empty choices
        # list; skip them instead of failing with IndexError.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token:
            full_response += token
            yield full_response



# Build the Gradio interface
title = "Hunyuan-Vision图生文Demo"
theme = gr.themes.Soft(
    primary_hue="teal",
    secondary_hue="blue",
    font=[gr.themes.GoogleFont("Noto Sans SC"), "Arial", "sans-serif"]
)

with gr.Blocks(title=title, theme=theme) as demo:
    # ================= Header =================
    gr.Markdown(f"""
    <div style="text-align: center;">
        <h1 style="color: #2E86C1; border-bottom: 3px solid #AED6F1; padding-bottom: 10px;">🖼️ {title}</h1>
        <p style="color: #616A6B;">上传图片并输入问题,体验腾讯混元视觉大模型的图像理解能力</p>
    </div>
    """)

    # ================= Main area =================
    with gr.Row(variant="panel"):
        # Left column: inputs
        with gr.Column(scale=3):
            # NOTE(review): confirm that gr.Group accepts `label` in the
            # Gradio version this app is deployed with.
            with gr.Group(label="输入区域"):
                image_input = gr.Image(
                    type="filepath",
                    label="上传图片",
                    height=400,
                    show_download_button=False,
                    elem_classes="preview-box"
                )
                question_input = gr.Textbox(
                    label="问题描述",
                    placeholder="请输入关于图片的问题...",
                    value="请详细描述图片中的场景、人物和细节",
                    lines=2
                )
                with gr.Row():
                    clear_btn = gr.Button("清空", variant="secondary")
                    # elem_id gives the button the id that the stylesheet's
                    # `button#generate` selector targets; api_name alone does
                    # not set an element id.
                    submit_btn = gr.Button(
                        "生成描述",
                        variant="primary",
                        elem_id="generate"
                    )

        # Right column: output
        with gr.Column(scale=4):
            with gr.Group(label="生成结果"):
                output = gr.Textbox(
                    label="描述内容",
                    interactive=False,
                    show_copy_button=True,
                    lines=12,
                    max_lines=20,
                    autoscroll=True
                )

    # ================= Examples =================
    with gr.Accordion("🖼️ 点击查看示例", open=False):
        with gr.Row():
            # Each example fills the image and question inputs
            gr.Examples(
                examples=[
                    ["tencent.png", "图片中的天气状况如何?"],
                    ["tencent.png", "描述参会人员的衣着特征"]
                ],
                inputs=[image_input, question_input],
                label="快速示例"
            )

    # ================= Event wiring =================
    # Submit: send the image and question to generate_caption and show
    # its output in the result textbox.
    submit_btn.click(
        fn=generate_caption,
        inputs=[image_input, question_input],
        outputs=output,
        api_name="generate"
    )

    # Clear: reset the image, the question and the output
    clear_btn.click(
        fn=lambda: [None, "", ""],
        outputs=[image_input, question_input, output],
        queue=False
    )

# ================= Custom styles =================
# Stylesheet for the preview image (rounded corners, shadow, slight zoom on
# hover) and for a button with id "generate" (transition). The leading and
# trailing empty entries keep the newline at both ends of the joined string.
css = "\n".join((
    "",
    ".preview-box img {border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);}",
    ".preview-box:hover img {transform: scale(1.02);}",
    "button#generate {transition: all 0.3s ease;}",
    "",
))
demo.css = css

if __name__ == "__main__":
    # Turn on the request queue before starting the server.
    demo.queue(default_concurrency_limit=100)

    # Server settings gathered in one place and passed to launch() as keywords.
    launch_options = {
        "server_port": 7860,
        "show_error": True,
        "favicon_path": "favicon.ico",
        "max_threads": 100,
    }
    demo.launch(**launch_options)

# # Build the Gradio interface (disabled alternative: minimal layout without theme, examples or clear button)
# title="Hunyuan-Vision图生文Demo"
# with gr.Blocks(title=title) as demo:
#     gr.Markdown(f"# 🖼️ {title}")
#     with gr.Row():
#         with gr.Column():
#             image_input = gr.Image(type="filepath", label="上传图片")
#             question_input = gr.Textbox(label="输入问题", value="请描述图片内容")
#             submit_btn = gr.Button("生成描述")
#         output = gr.Textbox(label="描述结果", interactive=False)

#     submit_btn.click(
#         fn=generate_caption,
#         inputs=[image_input, question_input],
#         outputs=output
#     )

# if __name__ == "__main__":
#     demo.queue(default_concurrency_limit=100)
#     demo.launch(max_threads=100)