import os
import platform
from functools import lru_cache

import gradio as gr
import nltk
import openai  # used to call an external OpenAI-compatible API (see the commented-out _translate variant below)
import spaces
import torch
from flores import code_mapping
from sacremoses import MosesPunctNormalizer
from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Debug: check that the API-key secret is exposed to the Space
print(os.getenv('key'))
# Sort languages alphabetically by display name for the dropdowns
code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[0]))
flores_codes = list(code_mapping.keys())

# Simplified list: REMOVED_TARGET_LANGUAGES is a placeholder (never defined in
# the original), so no target language is filtered out
REMOVED_TARGET_LANGUAGES: set = set()
target_languages = [language for language in flores_codes if language not in REMOVED_TARGET_LANGUAGES]

# (openai_client for the external-API variant is created inside the commented-out _translate below)
device = "cpu" if platform.system() == "Darwin" else "cuda"
MODEL_NAME = "ByteDance-Seed/Seed-X-PPO-7B"
def load_model():
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
    print(f"Model loaded on {device}")
    return model

model = load_model()

# Loading the tokenizer once, because re-loading it takes about 1.5 seconds each time
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Punctuation normalizer used by _translate below
punct_normalizer = MosesPunctNormalizer(lang="en")
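
# _translate() below calls get_language_specific_sentence_splitter(), which is
# missing from this file. A minimal sketch, assuming stopes' get_split_algo()
# takes a three-letter language code plus an algorithm name, and that FLORES
# codes look like "eng_Latn"; cached so each splitter is built only once:
@lru_cache(maxsize=100)
def get_language_specific_sentence_splitter(language_code: str):
    short_code = language_code[:3]  # e.g. "eng_Latn" -> "eng"
    return get_split_algo(short_code, "default")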
def translate(text: str, src_lang: str, tgt_lang: str):
    if not src_lang:
        raise gr.Error("The source language is empty! Please choose it in the dropdown list.")
    if not tgt_lang:
        raise gr.Error("The target language is empty! Please choose it in the dropdown list.")
    return _translate(text, src_lang, tgt_lang)
# Only assign GPU if cache not used
@spaces.GPU
def _translate(text: str, src_lang: str, tgt_lang: str):
    src_code = code_mapping[src_lang]
    tgt_code = code_mapping[tgt_lang]
    tokenizer.src_lang = src_code
    tokenizer.tgt_lang = tgt_code
    # normalizing the punctuation first
    text = punct_normalizer.normalize(text)
    paragraphs = text.split("\n")
    translated_paragraphs = []
    for paragraph in paragraphs:
        splitter = get_language_specific_sentence_splitter(src_code)
        sentences = list(splitter(paragraph))
        translated_sentences = []
        for sentence in sentences:
            input_tokens = (
                tokenizer(sentence, return_tensors="pt")
                .input_ids[0]
                .cpu()
                .numpy()
                .tolist()
            )
            translated_chunk = model.generate(
                input_ids=torch.tensor([input_tokens]).to(device),
                forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
                max_length=len(input_tokens) + 50,
                num_return_sequences=1,
                num_beams=5,
                no_repeat_ngram_size=4,  # repetition blocking works better if this number is below num_beams
                renormalize_logits=True,  # recompute token probabilities after banning the repetitions
            )
            translated_chunk = tokenizer.decode(
                translated_chunk[0], skip_special_tokens=True
            )
            translated_sentences.append(translated_chunk)
        translated_paragraph = " ".join(translated_sentences)
        translated_paragraphs.append(translated_paragraph)
    return "\n".join(translated_paragraphs)
# Alternative _translate that delegates to an external OpenAI-compatible API
# instead of the local model:
# def _translate(text: str, src_lang: str, tgt_lang: str):
#     prompt = f"Translate the following text from {src_lang} to {tgt_lang}. Directly output the translation result without any explanation:\n\n{text}"
#     key = os.getenv('key')
#     openai_client = openai.OpenAI(base_url="https://ssapi.cppbear.site/v1", api_key=key)
#     response = openai_client.chat.completions.create(
#         model="tbai.xin-dpsk-deepseek-v3",  # e.g. gpt-3.5-turbo or another compatible model
#         messages=[{"role": "user", "content": prompt}],
#         max_tokens=30240,
#         temperature=0.0,
#     )
#     print(response)
#     return response.choices[0].message.content.strip()
| description = """ | |
| <div style="text-align: center;"> | |
| <img src="https://github.com/user-attachments/assets/c42e675e-497c-4508-8bb9-093ad4d1f216" alt="UNESCO Meta Hugging Face Banner" style="max-width: 800px; width: 100%; margin: 0 auto;"> | |
| <h1 style="color: #0077be; font-size: 3em;">Seed-X, powered by Bytedance</h1> | |
| </div> | |
| We are excited to introduce Seed-X, a powerful series of open-source multilingual translation language models, including an instruction model, a reinforcement learning model, and a reward model. It pushes the boundaries of translation capabilities within 7 billion parameters. We develop Seed-X as an accessible, off-the-shelf tool to support the community in advancing translation research and applications: | |
| """ | |
examples_inputs = [["Seed-X is indeed a good translation model", "English", "Chinese"]]
with gr.Blocks() as demo:
    gr.Markdown(description)
    with gr.Row():
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        target_lang = gr.Dropdown(label="Target Language", choices=target_languages)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)
    with gr.Row():
        btn = gr.Button("Translate text")
    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)
    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )
    examples = gr.Examples(
        examples=examples_inputs,
        inputs=[input_text, src_lang, target_lang],
        fn=translate,
        outputs=output,
        cache_examples=True,
    )

demo.launch()
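
# To run this app locally (a sketch; the dependency list is inferred from the
# imports above, not from an official requirements.txt):
#   pip install gradio spaces torch transformers sacremoses stopes nltk openai
#   python app.py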