from paddleocr import PaddleOCR
from PIL import Image
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Sample image path and Korean sentence for quick local testing
img = "input_data/ocr_input/korean1.jpg"
text = "ννμ΄ μν° κ²λ μλͺ»μΈκ°μ. λ μ°¨κ°μ΄ λμμ λ°λ»ν μ¬μλ°. κ·Έλ₯ μ’μνλ¨ λ§λ μ λλκ°μ. μμ§νκ² λ λ§νκ³ μΆμ΄μ"
# DeepSeek chat model used to lemmatize the text and generate flashcards
model_id = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
def text_inference(text, language):
    """Ask the LLM to reduce the text to a deduplicated list of base-form words."""
    system_prompt = (
        f"Given the following {language} text, convert each word into its base form. "
        "Remove all duplicates. Return the base form words as a comma-separated list, and nothing else."
    )
    user_prompt = f"{system_prompt}\n\nText:\n{text}"
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": user_prompt}], return_tensors="pt"
    ).to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=256)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Parse the response: take the last line and split it on commas
    last_line = output_text.strip().split("\n")[-1]
    words = [w.strip() for w in last_line.split(",") if w.strip()]
    return words
def ocr_inference(img, lang):
    """Run PaddleOCR on an image and return the recognized text lines."""
    ocr = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=False)
    img_path = img
    result = ocr.ocr(img_path, cls=True)[0]
    image = Image.open(img_path).convert("RGB")  # currently unused
    boxes = [line[0] for line in result]    # bounding boxes (currently unused)
    txts = [line[1][0] for line in result]  # recognized text strings
    scores = [line[1][1] for line in result]  # confidence scores (currently unused)
    return txts
def make_flashcards(words, language):
    """Ask the LLM to turn each base-form word into a flashcard."""
    system_prompt = (
        f"For each {language} word in the list, write a flashcard in this format: "
        "the word, then its definition, then an example sentence using the word, "
        "and then a translation of the example sentence."
    )
    user_prompt = f"{system_prompt}\n\nWords:\n{words}"
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": user_prompt}], return_tensors="pt"
    ).to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=256)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Parse the response: take the last line and split it on colons
    last_line = output_text.strip().split("\n")[-1]
    output = [w.strip() for w in last_line.split(":") if w.strip()]
    return output
# words=text_inference(text, "korean")
# print("OUTPUT TOUT OUETOI EIFJ IEFJ",words)
# print("flashcard output:",make_flashcards(words, "korean"))
# print("OCR OUTPUT: ", ocr_inference(img, "korean"))
# words=text_inference(text, "korean")
# print("TEXT INPUT: ", text)
# print("WORD PARSING: ",words)
# print("flashcard output:",make_flashcards(words, "korean"))
examples = [
    [{"text": "@RolmOCR OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
    [{"text": "@RolmOCR Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
    [{"text": "@RolmOCR OCR the Image", "files": ["rolm/3.jpeg"]}],
    [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
]
demo = gr.ChatInterface(
    fn=chat_ocr,  # adapter defined above; ocr_inference's signature does not match ChatInterface
    description="# **Multimodal OCR `@RolmOCR and Default Qwen2VL OCR`**",
    examples=examples,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", "video"],
        file_count="multiple",
        placeholder="Use tag @RolmOCR for RolmOCR, or leave blank for default Qwen2VL OCR",
    ),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
)

demo.launch(debug=True)