# explain_lang / app.py
import requests
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Sample inputs for local testing: an image for OCR and a Korean sentence.
img = "input_data/ocr_input/japan1.jpg"
# Roughly: "Is it wrong that I'm awkward at expressing myself? I'm a warm person in a cold city.
# Can't I even say that I simply like you? Honestly, I want to say it."
text = "ν‘œν˜„μ΄ μ„œνˆ° 것도 잘λͺ»μΈκ°€μš”. λ‚˜ μ°¨κ°€μš΄ λ„μ‹œμ— λ”°λœ»ν•œ μ—¬μž”λ°. κ·Έλƒ₯ μ’‹μ•„ν•œλ‹¨ 말도 μ•ˆ λ˜λŠ”κ°€μš”. μ†”μ§ν•˜κ²Œ λ‚œ λ§ν•˜κ³  μ‹Άμ–΄μš”"
# Load the DeepSeek 7B chat model in fp16; device_map="auto" places it on available GPUs.
model_id = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
def text_inference(text, language):
    """Ask the chat model for the base (dictionary) form of every word in `text`."""
    system_prompt = (
        f"Given the following {language} text, extract all words in their base (dictionary) form, including verbs, adjectives, nouns, and particles. "
        "Remove all duplicates. Return the base form words as a comma-separated list, and nothing else."
    )
    user_prompt = f"{system_prompt}\n\nText:\n{text}"
    input_ids = tokenizer.apply_chat_template([{"role": "user", "content": user_prompt}], return_tensors="pt").to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=256)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Parse the response: take the last line and split it on commas.
    last_line = output_text.strip().split("\n")[-1]
    words = [w.strip() for w in last_line.split(",") if w.strip()]
    return words
def ocr_inference(img, lang):
    """Run PaddleOCR on `img` and return the recognized text lines."""
    ocr = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=False)
    img_path = img
    result = ocr.ocr(img_path, cls=True)[0]
    image = Image.open(img_path).convert('RGB')  # loaded in case draw_ocr visualisation is wanted
    # Each entry in `result` is [box, (text, confidence)]; only the text is returned here.
    boxes = [line[0] for line in result]
    txts = [line[1][0] for line in result]
    scores = [line[1][1] for line in result]
    return txts
def make_flashcards(words, language):
    """For each word, ask the model for a definition, an example sentence, and its translation."""
    system_prompt = (
        f"Given the following {language} words, define each word and create an example sentence using it, with an explanation. "
        "Do this for every word. Respond with the word, its definition, the example sentence, and the translation of the example sentence."
    )
    user_prompt = f"{system_prompt}\n\nWords:\n{', '.join(words)}"
    input_ids = tokenizer.apply_chat_template([{"role": "user", "content": user_prompt}], return_tensors="pt").to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=256)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Parse the response: take the last line and split it on colons.
    last_line = output_text.strip().split("\n")[-1]
    output = [w.strip() for w in last_line.split(":") if w.strip()]
    return output
# Quick smoke test of the pipeline on the sample Korean text.
words = text_inference(text, "korean")
print("Extracted base-form words:", words)
print("Flashcard output:", make_flashcards(words, "korean"))