ABOUT = """
# TB-OCR Preview 0.1 Unofficial Demo
This is an unofficial demo of [yifeihu/TB-OCR-preview-0.1](https://huggingface.co/yifeihu/TB-OCR-preview-0.1).
Overview of TB-OCR:
> TB-OCR-preview (Text Block OCR), created by [Yifei Hu](https://x.com/hu_yifei), is an end-to-end OCR model handling text, math latex, and markdown formats all at once. The model takes a block of text as the input and returns clean markdown output. Headers are marked with `##`. Math expressions are guaranteed to be wrapped in brackets `\( inline math \) \[ display math \]` for easier parsing. This model does not require line-detection or math formula detection.
(From the [model card](https://huggingface.co/yifeihu/TB-OCR-preview-0.1))
"""
# check out https://huggingface.co/microsoft/Phi-3.5-vision-instruct for more details
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import gradio as gr
model_id = "yifeihu/TB-OCR-preview-0.1"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if not torch.cuda.is_available():
    ABOUT += (
        "\n\n### This demo is running on CPU\n\n"
        "Running on CPU is very slow. Consider duplicating this Space or running it "
        "locally to skip the queue and get faster responses."
    )
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=DEVICE,
    trust_remote_code=True,
    torch_dtype="auto",
    # _attn_implementation="flash_attention_2",  # optional: faster attention on supported GPUs
)
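# Optional memory saver (a sketch, untested with this model): load the weights in
# 4-bit via bitsandbytes by passing a quantization_config, instead of the legacy
# load_in_4bit flag. Requires the `bitsandbytes` package and a CUDA GPU.
#
# from transformers import BitsAndBytesConfig
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     device_map=DEVICE,
#     trust_remote_code=True,
#     quantization_config=BitsAndBytesConfig(load_in_4bit=True),
# )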
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
    num_crops=16,  # number of image crops used by the Phi-3.5-vision processor; more helps with large/dense pages
)
def phi_ocr(image_path):
    # The Gradio image component is configured with type="filepath",
    # so this receives a path to a local file, not a URL.
    question = "Convert the text to markdown format."
    image = Image.open(image_path)
    prompt_message = [{
        "role": "user",
        "content": f"<|image_1|>\n{question}",
    }]
    prompt = processor.tokenizer.apply_chat_template(
        prompt_message, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(prompt, [image], return_tensors="pt").to(DEVICE)
    generation_args = {
        "max_new_tokens": 1024,
        "temperature": 0.1,  # ignored when do_sample=False (greedy decoding)
        "do_sample": False,
    }
    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        **generation_args,
    )
    # Strip the prompt tokens, keeping only the newly generated ones.
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    response = response.split("<image_end>")[0]  # drop the <image_end> token and anything after it
    return response
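# Quick sanity check outside the UI (hypothetical file path):
# print(phi_ocr("sample_page.png"))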
with gr.Blocks() as demo:
    gr.Markdown(ABOUT)
    with gr.Row():
        with gr.Column():
            img = gr.Image(label="Input image", type="filepath")
            btn = gr.Button("OCR")
        with gr.Column():
            out = gr.Markdown()
    btn.click(phi_ocr, inputs=img, outputs=out)

demo.queue().launch()
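# Optional (an assumption, not part of the original Space): to reach the app from
# other machines on the network, pass server_name to launch(), e.g.
# demo.queue().launch(server_name="0.0.0.0")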