File size: 2,580 Bytes
559732c
 
 
51d22a6
559732c
 
 
 
 
 
c86b97e
 
 
559732c
51d22a6
559732c
51d22a6
559732c
 
 
51d22a6
c86b97e
559732c
51d22a6
559732c
 
 
 
51d22a6
559732c
 
 
 
51d22a6
559732c
 
 
 
51d22a6
559732c
 
 
51d22a6
 
 
559732c
 
 
51d22a6
559732c
 
 
 
 
51d22a6
559732c
 
 
 
 
 
 
 
 
 
 
51d22a6
559732c
 
 
 
 
 
 
 
 
51d22a6
559732c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import html
import os
import re
import tempfile

import gradio as gr
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer first: its EOS token id is reused below as the model's
# padding token id.
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)

# Load the OCR model from the same checkpoint, placed on the CUDA device.
model = AutoModel.from_pretrained(
    'ucaslcl/GOT-OCR2_0',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
# Put the model in evaluation mode and move it to the GPU.
model = model.eval().cuda()


def extract_text_from_image(image):
    """Run OCR on an uploaded image and return the recognised text.

    The model's ``chat`` API takes a file path, so the image is written to a
    uniquely named temporary JPEG that is always removed afterwards, even when
    saving or OCR fails.

    Args:
        image: A PIL image (as delivered by ``gr.Image(type="pil")``), or
            ``None`` when nothing has been uploaded.

    Returns:
        The text returned by ``model.chat``; an empty string when ``image``
        is ``None``.
    """
    if image is None:
        # The button was clicked without an upload; nothing to recognise.
        return ""

    # JPEG cannot store alpha or palette data, so normalise such modes before
    # saving. RGB and greyscale images are written unchanged.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    # A unique path per call keeps concurrent requests from overwriting or
    # deleting each other's file (a fixed name in the CWD is shared by all).
    fd, image_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)  # Only the path is needed; PIL reopens the file itself.
    try:
        image.save(image_path)
        return model.chat(tokenizer, image_path, ocr_type='ocr')
    finally:
        os.remove(image_path)


def search_and_highlight_keyword(extracted_text, keyword):
    """Return an HTML fragment with the lines that contain ``keyword``.

    Matching is case-insensitive and literal (regex metacharacters in the
    keyword have no special meaning). Only lines containing at least one match
    are kept; each match is wrapped in a styled ``<mark>`` element and the
    lines are joined with ``<br>``.

    Both the text and the keyword are HTML-escaped before being placed in the
    result, so markup-like content coming from the OCR output or the user is
    displayed literally instead of being interpreted.

    Args:
        extracted_text: Text to search; ``None`` is treated as empty text.
        keyword: Literal string to look for.

    Returns:
        An HTML string: the highlighted matching lines, or a ``<p>`` message
        when the keyword is empty or does not occur.
    """
    if not keyword:
        return "<p>Please provide a keyword for searching.</p>"

    mark = "<mark style='background-color: #ffcc00; color: black;'>{}</mark>"
    # The capturing group makes split() return the matched text as well:
    # even indexes are the text around matches, odd indexes are the matches.
    splitter = re.compile(f"({re.escape(keyword)})", re.IGNORECASE)

    highlighted_lines = []
    for line in (extracted_text or "").splitlines():
        pieces = splitter.split(line)
        if len(pieces) == 1:
            # No match on this line.
            continue
        # Escape every piece separately so matching happens on the raw text
        # (never on entities such as "&amp;") while the output stays safe.
        highlighted_lines.append("".join(
            mark.format(html.escape(piece)) if index % 2 else html.escape(piece)
            for index, piece in enumerate(pieces)
        ))

    if highlighted_lines:
        return '<br>'.join(highlighted_lines)
    return f"<p>Keyword '{html.escape(keyword)}' not found in the text.</p>"

# Gradio interface components
# The UI has two steps: (1) upload an image and extract its text, then
# (2) search the extracted text for a keyword and show the highlighted lines.
with gr.Blocks() as demo:
    
    gr.Markdown("# OCR and Keyword Search App with Highlighting")
    
    # Step 1: the image is handed to the callback as a PIL image (type="pil").
    image_input = gr.Image(type="pil", label="Upload an Image (JPEG format)")
    text_output = gr.Textbox(label="Extracted Text", placeholder="Text will appear here after OCR.")
    
    extract_button = gr.Button("Extract Text")
    
    # Clicking runs OCR on the uploaded image and writes the result into the
    # "Extracted Text" box.
    extract_button.click(fn=extract_text_from_image, 
                         inputs=image_input, 
                         outputs=text_output)
    
    
    # Step 2: keyword search over whatever is currently in the text box.
    keyword_input = gr.Textbox(label="Enter Keyword to Search and Highlight")
    search_result = gr.HTML(label="Highlighted Text with Keyword")
    
    search_button = gr.Button("Search and Highlight Keyword")
    
    # The text box doubles as the input here, so the search uses its current
    # contents together with the entered keyword; the result is an HTML string.
    search_button.click(fn=search_and_highlight_keyword, 
                        inputs=[text_output, keyword_input], 
                        outputs=search_result)


# Start the app at import/run time. share=True asks Gradio for a public share
# link in addition to the local server.
demo.launch(share=True)