Spaces:
Running
on
Zero
Running
on
Zero
File size: 11,238 Bytes
f08abae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 |
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import xml.etree.ElementTree as ET
import os
# --- Helper Functions ---
def get_alto_namespace(xml_file_path):
    """
    Dynamically gets the ALTO namespace from the root element of an XML file.

    Only the start of the document is read: parsing stops as soon as the root
    start tag has been seen, so large files are not loaded completely and
    errors located after the root tag are not detected here.

    Args:
        xml_file_path (str): Path to the XML file.

    Returns:
        str: The namespace in ElementTree form, e.g.
        ``{http://www.loc.gov/standards/alto/v3/alto.xsd}``, or ``''`` when
        the root tag carries no namespace, no path is given, or the file
        cannot be read or parsed.
    """
    if not xml_file_path:
        return ''
    try:
        # Open the file ourselves so the handle is closed deterministically
        # even though the iterparse generator is abandoned after one event.
        with open(xml_file_path, 'rb') as xml_stream:
            # The first "start" event always belongs to the root element.
            for _event, root in ET.iterparse(xml_stream, events=('start',)):
                # ElementTree spells a namespaced tag as "{namespace_uri}tag".
                if '}' in root.tag:
                    return root.tag.split('}')[0] + '}'
                return ''
    except ET.ParseError:
        print(f"Error parsing XML to find namespace: {xml_file_path}")
    except OSError as e:
        print(f"Error reading XML to find namespace: {xml_file_path}: {e}")
    return ''  # Default to no namespace if not found or error
def parse_alto_xml(xml_file_path):
    """
    Parses an ALTO XML file to extract text content and bounding box info.

    The file is parsed once; the namespace prefix is taken from the root tag
    of that parsed tree. Every ``String`` element that is a direct child of a
    ``TextLine`` and has a non-empty ``CONTENT`` attribute contributes one
    word. Words of a line are joined with single spaces, lines with newlines.

    Args:
        xml_file_path (str): Path to the ALTO XML file.

    Returns:
        tuple: ``(full_text, ocr_data)`` where
        - full_text (str): All extracted text concatenated, or an error
          message when the file is missing or cannot be parsed.
        - ocr_data (list): A list of dictionaries, each with
          {'text': str, 'x': int, 'y': int, 'w': int, 'h': int}. Empty on
          error. Words whose coordinates cannot be parsed get the
          placeholder box x=0, y=0, w=10, h=10.
    """
    if not xml_file_path or not os.path.exists(xml_file_path):
        return "Error: XML file not provided or does not exist.", []

    # Output key -> ALTO attribute holding the corresponding coordinate.
    box_attrs = (('x', 'HPOS'), ('y', 'VPOS'), ('w', 'WIDTH'), ('h', 'HEIGHT'))
    full_text_lines = []
    ocr_data = []
    try:
        root = ET.parse(xml_file_path).getroot()
        # ElementTree spells a namespaced tag as "{namespace_uri}tag"; reuse
        # the root's prefix instead of parsing the file a second time.
        ns_prefix = (root.tag.split('}')[0] + '}') if '}' in root.tag else ''

        for text_line in root.findall(f'.//{ns_prefix}TextLine'):
            line_text_parts = []
            for string_element in text_line.findall(f'{ns_prefix}String'):
                text = string_element.get('CONTENT')
                if not text:  # Skip missing or empty CONTENT
                    continue
                line_text_parts.append(text)
                try:
                    # Coordinates may be written as decimals; truncate to int.
                    box = {
                        key: int(float(string_element.get(attr)))
                        for key, attr in box_attrs
                    }
                except (ValueError, TypeError, OverflowError) as e:
                    # TypeError: attribute missing; ValueError: not a number
                    # (or NaN); OverflowError: infinite value.
                    print(f"Warning: Could not parse coordinates for '{text}': {e}")
                    box = {'x': 0, 'y': 0, 'w': 10, 'h': 10}  # Placeholder
                ocr_data.append({'text': text, **box})
            if line_text_parts:
                full_text_lines.append(" ".join(line_text_parts))
        return "\n".join(full_text_lines), ocr_data
    except ET.ParseError as e:
        return f"Error parsing XML: {e}", []
    except Exception as e:
        # Boundary for the UI: report anything unexpected as a message
        # instead of raising into the caller.
        return f"An unexpected error occurred during XML parsing: {e}", []
def draw_ocr_on_image(image_pil, ocr_data):
    """
    Draws bounding boxes and text from ocr_data onto the image.

    The image is modified in place; pass a copy to keep the original clean.

    Args:
        image_pil (PIL.Image): The image to draw on.
        ocr_data (list): List of OCR data dictionaries with the keys
            'text', 'x', 'y', 'w' and 'h'.
    Returns:
        PIL.Image: The same image object with overlays, or the input
        unchanged when there is no image or no OCR data.
    """
    if image_pil is None or not ocr_data:
        return image_pil  # Return original image if no data or image

    draw = ImageDraw.Draw(image_pil)

    # Font size heuristic: 60% of the average box height, at least 8.
    # Average only over boxes with a positive height so that degenerate
    # boxes do not drag the average (and the font size) down.
    heights = [d['h'] for d in ocr_data if d['h'] > 0]
    avg_height = sum(heights) / len(heights) if heights else 10
    font_size = max(8, int(avg_height * 0.6))

    # Try scalable fonts in order; fall back to the built-in default font
    # when none of them can be loaded.
    font_candidates = ("arial.ttf", "DejaVuSans.ttf")
    font = None
    for font_name in font_candidates:
        try:
            font = ImageFont.truetype(font_name, font_size)
            break
        except OSError:
            continue
    if font is None:
        font = ImageFont.load_default()
        font_size = 10  # Default font is usually small
        print(f"None of the fonts {font_candidates} found, using default font.")

    for item in ocr_data:
        x, y, w, h = item['x'], item['y'], item['w'], item['h']
        text = item['text']

        # Draw bounding box
        draw.rectangle([(x, y), (x + w, y + h)], outline="red", width=2)

        # Put the label above the box; if that would leave the image at the
        # top edge, put it just inside the box instead.
        label_y = y - font_size - 2
        if label_y <= 0:
            label_y = y + 2
        draw.text((x + 2, label_y), text, fill="green", font=font)
    return image_pil
# --- Gradio Interface Function ---
def process_image_and_xml(image_file, xml_file, show_overlay):
    """
    Main function for the Gradio interface.

    Args:
        image_file: Uploaded image, either an object exposing the file path
            as ``.name`` or a plain path string; ``None`` if nothing was
            uploaded.
        xml_file: Uploaded ALTO XML file in the same form as ``image_file``.
        show_overlay (bool): Whether to render the OCR boxes on a copy of
            the image.

    Returns:
        tuple: ``(image, text, overlay_image)`` where ``image`` is the
        uploaded image converted to RGB (``None`` if missing or unreadable),
        ``text`` is the extracted text or a status/error message, and
        ``overlay_image`` is the annotated copy or ``None``.
    """
    if image_file is None:
        return None, "Please upload an image.", None

    # Accept both upload objects (path in `.name`) and plain path strings.
    image_path = getattr(image_file, "name", image_file)
    try:
        # The context manager closes the underlying file once the pixel
        # data has been copied into the RGB image used for drawing.
        with Image.open(image_path) as opened_image:
            img_pil = opened_image.convert("RGB")
    except Exception as e:
        return None, f"Error loading image: {e}", None

    if xml_file is None:
        return img_pil, "Please upload an OCR XML file.", None

    # Parse XML
    xml_path = getattr(xml_file, "name", xml_file)
    extracted_text, ocr_box_data = parse_alto_xml(xml_path)

    overlay_image_pil = None
    if show_overlay:
        if ocr_box_data:
            # Draw on a copy to keep the original clean for the first output
            overlay_image_pil = draw_ocr_on_image(img_pil.copy(), ocr_box_data)
        elif not extracted_text.startswith(("Error", "An unexpected error")):
            # Overlay requested but the XML yielded no boxes; do not append
            # the note to a message that already reports a parsing failure.
            extracted_text += "\n(No bounding box data found or parsed for overlay)"
    return img_pil, extracted_text, overlay_image_pil
# --- Create Gradio App ---
# Builds the Gradio UI: two upload widgets, an overlay toggle and a submit
# button as inputs; the uploaded image, the extracted text and the optional
# overlay image as outputs.
# NOTE(review): the nesting of the layout containers below is reconstructed
# from statement order - confirm the intended row/column placement.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# OCR Viewer (ALTO XML)")
    gr.Markdown(
        "Upload an image and its corresponding ALTO OCR XML file. "
        "The app will display the image, extract and show the plain text, "
        "and optionally overlay the OCR predictions on the image."
    )

    with gr.Row():
        with gr.Column(scale=1):
            # NOTE(review): the handlers read the upload path from `.name`,
            # which relies on type="file"; confirm this value is accepted by
            # the installed Gradio version.
            image_input = gr.File(label="Upload Image (PNG, JPG, etc.)", type="file")
            xml_input = gr.File(label="Upload ALTO XML File (.xml)", type="file")
            show_overlay_checkbox = gr.Checkbox(label="Show OCR Overlay on Image", value=False)
            submit_button = gr.Button("Process Files", variant="primary")

    with gr.Row():
        with gr.Column(scale=1):
            output_image_orig = gr.Image(label="Uploaded Image", type="pil", interactive=False)
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Extracted Plain Text", lines=15, interactive=False)
    # Always visible; its content is None whenever no overlay was produced.
    output_image_overlay = gr.Image(label="Image with OCR Overlay", type="pil", interactive=False, visible=True)

    def update_interface(image_f, xml_f, show_overlay_val):
        """
        Event handler shared by the submit button and the overlay checkbox.

        Returns the tuple (image, text, overlay_image) for the three output
        components. When a file is missing, a prompt is shown instead of
        processing; an unreadable image is reported as a message rather
        than raised.
        """
        if image_f is None:
            return None, "Please upload an image file.", None
        if xml_f is None:
            # Still preview the image while waiting for the XML upload.
            try:
                with Image.open(image_f.name) as opened_image:
                    preview = opened_image.convert("RGB")
            except Exception as e:
                return None, f"Error loading image: {e}", None
            return preview, "Please upload an XML file.", None

        # With the overlay unchecked the third element is None, which the
        # overlay image component receives as its value.
        img, text, overlay_img = process_image_and_xml(image_f, xml_f, show_overlay_val)
        return img, text, overlay_img

    submit_button.click(
        fn=update_interface,
        inputs=[image_input, xml_input, show_overlay_checkbox],
        outputs=[output_image_orig, output_text, output_image_overlay]
    )
    # Also update if the checkbox changes, provided files are already there
    show_overlay_checkbox.change(
        fn=update_interface,
        inputs=[image_input, xml_input, show_overlay_checkbox],
        outputs=[output_image_orig, output_text, output_image_overlay]
    )

    gr.Markdown("---")
    gr.Markdown("### Example ALTO XML Snippet (for `String` element extraction):")
    gr.Code(
        language="xml",
        value="""
<alto xmlns="http://www.loc.gov/standards/alto/v3/alto.xsd">
<Description>...</Description>
<Styles>...</Styles>
<Layout>
<Page ID="Page13" PHYSICAL_IMG_NR="13" WIDTH="2394" HEIGHT="3612">
<PrintSpace>
<TextLine WIDTH="684" HEIGHT="108" ID="p13_t1" HPOS="465" VPOS="196">
<String ID="p13_w1" CONTENT="Introduction" HPOS="465" VPOS="196" WIDTH="684" HEIGHT="108" STYLEREFS="font0"/>
</TextLine>
<TextLine WIDTH="1798" HEIGHT="51" ID="p13_t2" HPOS="492" VPOS="523">
<String ID="p13_w2" CONTENT="Britain" HPOS="492" VPOS="523" WIDTH="166" HEIGHT="51" STYLEREFS="font1"/>
<SP WIDTH="24" VPOS="523" HPOS="658"/>
<String ID="p13_w3" CONTENT="1981" HPOS="682" VPOS="523" WIDTH="117" HEIGHT="51" STYLEREFS="font1"/>
<!-- ... more String and SP elements ... -->
</TextLine>
<!-- ... more TextLine elements ... -->
</PrintSpace>
</Page>
</Layout>
</alto>
""",
        interactive=False
    )
if __name__ == "__main__":
    # Local test setup: write a blank placeholder image sized like the page
    # in the example XML, and warn if the sample XML file is not present in
    # the working directory. Failures here are reported but never prevent
    # the app from starting.
    try:
        from PIL import Image as PImage

        # 2394 x 3612 are the page dimensions from the example XML.
        page_size = (2394, 3612)
        img = PImage.new('RGB', page_size, color='lightgray')
        img.save("dummy_image.png")
        print("Created dummy_image.png for testing.")

        sample_xml_present = os.path.exists("189819724.34.xml")
        if not sample_xml_present:
            print("WARNING: Example XML '189819724.34.xml' not found. Please create it or upload your own.")
    except ImportError:
        print("Pillow not installed, can't create dummy image.")
    except Exception as e:
        print(f"Error creating dummy image: {e}")

    demo.launch()