Spaces:

dimasdeffieux
/

explain_lang

Sleeping

App Files Files Community

explain_lang / app.py

dimasdeffieux

Update app.py

f979edc verified 4 months ago

raw

history blame

1.92 kB


	import requests
	from paddleocr import PaddleOCR, draw_ocr
	from PIL import Image
	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import torch

	# Sample inputs for manual experimentation. `text` is passed to
	# text_inference() by the print call at the bottom of this file; `img` is not
	# referenced by any other code visible here.
	img = "input_data/ocr_input/japan1.jpg"
	text = "표현이 서툰 것도 잘못인가요. 나 차가운 도시에 따뜻한 여잔데. 그냥 좋아한단 말도 안 되는가요. 솔직하게 난 말하고 싶어요"

	# Chat model used by text_inference(). Both objects are created at module
	# load time, so loading this file triggers the model download/initialisation.
	# NOTE(review): trust_remote_code=True lets the model repository run its own
	# Python code during loading — confirm this is intended for this model id.
	model_id = "deepseek-ai/deepseek-llm-7b-chat"
	tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
	# Weights are requested in float16 and device placement is left to
	# device_map="auto".
	model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)

	def text_inference(text, language):
	    """Ask the chat model for the base-form words found in ``text``.

	    A single user message is built that instructs the model to list every
	    word of the ``language`` text in dictionary form as a comma-separated
	    list. Only the newly generated tokens are decoded; the last line of
	    that reply is split on commas.

	    Args:
	        text: The passage to analyse.
	        language: Name of the passage's language, interpolated into the
	            instruction (e.g. ``"korean"``).

	    Returns:
	        A list of stripped, non-empty words in first-seen order with
	        duplicates removed. Empty when the model produces no usable reply.
	    """
	    system_prompt = (
	        f"Given the following {language} text, extract all words in their base (dictionary) form, including verbs, adjectives, nouns, and particles. "
	        "Remove all duplicates. Return the base form words as a comma-separated list, and nothing else."
	    )
	    user_prompt = f"{system_prompt}\n\nText:\n{text}"

	    # add_generation_prompt=True appends the assistant-turn marker so the
	    # model answers the request instead of continuing the user message.
	    input_ids = tokenizer.apply_chat_template(
	        [{"role": "user", "content": user_prompt}],
	        add_generation_prompt=True,
	        return_tensors="pt",
	    ).to(model.device)
	    output_ids = model.generate(input_ids, max_new_tokens=256)

	    # The generated sequence starts with the prompt tokens. Drop them so the
	    # instruction and input text can never be mistaken for the answer.
	    prompt_length = input_ids.shape[-1]
	    output_text = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

	    # Parse response: take last line, split by commas
	    last_line = output_text.strip().split("\n")[-1]
	    words = [w.strip() for w in last_line.split(",") if w.strip()]

	    # The prompt asks for unique words, but the model may still repeat some;
	    # dict.fromkeys removes repeats while keeping first-seen order.
	    return list(dict.fromkeys(words))

	def ocr_inference(img, lang):
	    """Run PaddleOCR on one image and return the recognised text strings.

	    Args:
	        img: Image input handed straight to ``PaddleOCR.ocr`` (the sample
	            at the top of this file is a file path).
	        lang: Language code forwarded to the ``PaddleOCR`` constructor.

	    Returns:
	        A list with the recognised text of each detected line, in the order
	        PaddleOCR reports them. Empty when no text is detected.
	    """
	    # NOTE: a new CPU-only engine is constructed on every call.
	    ocr = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=False)
	    result = ocr.ocr(img, cls=True)

	    # The first entry holds the detections for the single input image. It is
	    # None (or empty) when nothing was recognised, so guard before iterating.
	    detections = result[0] if result else None
	    if not detections:
	        return []

	    # Each detection is indexed as [box, (text, score)]; only the text is used.
	    return [line[1][0] for line in detections]

	def make_flashcards(words):
	    """Placeholder for flashcard generation; not implemented yet.

	    Args:
	        words: Currently ignored.

	    Returns:
	        Always ``None``.
	    """
	    return None

	# Ad-hoc smoke test: runs text_inference() on the sample `text` above with
	# language "korean" and prints the resulting word list after a debug tag.
	# NOTE(review): this appears to be a module-level statement, so it would run
	# a full model generation every time the file is executed — confirm intended.
	print("OUTPUT TOUT OUETOI EIFJ IEFJ",text_inference(text, "korean"))