omvishesh committed on
Commit
559732c
·
verified ·
1 Parent(s): 715fb0b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoModel, AutoTokenizer
3
+ import os
4
+ import re # Import regular expressions module
5
+
6
# Load the OCR tokenizer and model once, at import time.
# The same Hub identifier is passed to both from_pretrained calls.
_MODEL_ID = 'ucaslcl/GOT-OCR2_0'

# trust_remote_code=True is required by both loaders below because the
# repository ships its own Python code alongside the weights.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, trust_remote_code=True)

_loaded_model = AutoModel.from_pretrained(
    _MODEL_ID,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    # Reuse the end-of-sequence token id as the padding token id.
    pad_token_id=tokenizer.eos_token_id,
)

# Switch to evaluation mode and place the model on the CUDA device.
model = _loaded_model.eval().cuda()
14
+
15
# Define the function to process images and extract text
def extract_text_from_image(image):
    """Run OCR on an uploaded image and return the extracted text.

    The image is written to a uniquely named temporary JPEG file because
    ``model.chat`` is called with a file path, not an in-memory image.

    Args:
        image: The PIL image delivered by the Gradio image component, or
            ``None`` when the user has not uploaded anything.

    Returns:
        The value returned by ``model.chat(..., ocr_type='ocr')`` for the
        image, or an empty string when ``image`` is ``None``.
    """
    # Local import keeps this block self-contained (tempfile is stdlib).
    import tempfile

    # Clicking the button without an upload passes None; return nothing
    # instead of failing with AttributeError on image.save().
    if image is None:
        return ""

    # The file is saved with a .jpg suffix, and the JPEG encoder cannot
    # store an alpha channel or a palette (e.g. RGBA / P-mode PNG uploads),
    # so normalise everything that is not already plain RGB.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # A unique path per call: a fixed file name would be overwritten or
    # deleted by another request that is being processed at the same time.
    fd, image_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)  # only the path is needed; PIL reopens the file itself
    try:
        image.save(image_path)
        return model.chat(tokenizer, image_path, ocr_type='ocr')
    finally:
        # Always clean up, even when saving or OCR raises.
        os.remove(image_path)
28
+
29
# Function to search for the keyword in extracted text and highlight it
def search_and_highlight_keyword(extracted_text, keyword):
    """Return an HTML fragment showing the lines that contain ``keyword``.

    The search is literal (regex metacharacters in ``keyword`` are escaped)
    and case-insensitive. Only lines with at least one match are kept; every
    match is wrapped in a styled ``<mark>`` element and the kept lines are
    joined with ``<br>``.

    All text taken from the inputs is HTML-escaped before it is placed in
    the fragment, so characters such as ``<`` or ``&`` in the OCR output or
    in the keyword are displayed literally instead of being interpreted as
    markup.

    Args:
        extracted_text: The text to search; ``None`` or an empty string is
            treated as text without any lines.
        keyword: The literal keyword to look for.

    Returns:
        An HTML string: the highlighted matching lines, a prompt asking for
        a keyword when ``keyword`` is empty, or a "not found" message.
    """
    # Local import keeps this block self-contained (html is stdlib).
    import html

    if not keyword:
        return "<p>Please provide a keyword for searching.</p>"

    # Compile once; re.escape makes the keyword match literally.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)

    def render_line(line):
        """Escape ``line`` for HTML and wrap each keyword match in <mark>."""
        pieces = []
        cursor = 0
        for match in pattern.finditer(line):
            # Escape the plain text between matches and the match itself
            # separately, so the <mark> tags are the only real markup.
            pieces.append(html.escape(line[cursor:match.start()]))
            pieces.append(
                "<mark style='background-color: #ffcc00; color: black;'>"
                f"{html.escape(match.group(0))}</mark>"
            )
            cursor = match.end()
        pieces.append(html.escape(line[cursor:]))
        return "".join(pieces)

    highlighted_lines = [
        render_line(line)
        for line in (extracted_text or "").splitlines()
        if pattern.search(line)
    ]

    if highlighted_lines:
        # Join the lines with HTML <br> for line breaks
        return "<br>".join(highlighted_lines)
    return f"<p>Keyword '{html.escape(keyword)}' not found in the text.</p>"
52
+
53
# Gradio interface: step 1 extracts text from an image, step 2 searches
# that text for a keyword and shows the matching lines as HTML.
with gr.Blocks() as demo:
    gr.Markdown("# OCR and Keyword Search App with Highlighting")

    # --- Step 1: image upload and OCR ---
    image_input = gr.Image(
        type="pil",
        label="Upload an Image (JPEG format)",
    )
    text_output = gr.Textbox(
        label="Extracted Text",
        placeholder="Text will appear here after OCR.",
    )
    extract_button = gr.Button("Extract Text")

    # The uploaded image goes in, the OCR result fills the textbox.
    extract_button.click(
        fn=extract_text_from_image,
        inputs=image_input,
        outputs=text_output,
    )

    # --- Step 2: keyword search and highlight ---
    keyword_input = gr.Textbox(label="Enter Keyword to Search and Highlight")
    search_result = gr.HTML(label="Highlighted Text with Keyword")
    search_button = gr.Button("Search and Highlight Keyword")

    # The search reads the current contents of the OCR textbox plus the keyword.
    search_button.click(
        fn=search_and_highlight_keyword,
        inputs=[text_output, keyword_input],
        outputs=search_result,
    )

# Launch the Gradio app
demo.launch(share=True)