# File size: 11,238 Bytes
# f08abae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import xml.etree.ElementTree as ET
import os

# --- Helper Functions ---

def get_alto_namespace(xml_file_path):
    """
    Dynamically gets the ALTO namespace from the XML file.

    Only the opening tag of the root element is read; the rest of the
    document is not parsed.

    Args:
        xml_file_path (str): Path to the ALTO XML file.

    Returns:
        str: The namespace in ElementTree's "{uri}" form, e.g.
             "{http://www.loc.gov/standards/alto/v3/alto.xsd}", or '' when the
             root tag has no namespace or the file cannot be read or parsed.
    """
    try:
        with open(xml_file_path, "rb") as xml_file:
            # The first "start" event is always the root element, so the loop
            # body runs at most once.
            for _event, root in ET.iterparse(xml_file, events=("start",)):
                # ElementTree spells a namespaced tag as "{namespace_uri}tag".
                if '}' in root.tag:
                    return root.tag.split('}')[0] + '}'
                break
    except ET.ParseError:
        print(f"Error parsing XML to find namespace: {xml_file_path}")
    except OSError as e:
        print(f"Error reading XML to find namespace: {xml_file_path} ({e})")
    return ''  # Default to no namespace if not found or error

def parse_alto_xml(xml_file_path):
    """
    Parses an ALTO XML file to extract text content and bounding box info.

    The document is parsed once; the namespace prefix is taken from the root
    tag of that parsed tree. Only String elements that are direct children of
    a TextLine and have a non-empty CONTENT attribute are collected.

    Args:
        xml_file_path (str): Path to the ALTO XML file.

    Returns:
        - full_text (str): One line per TextLine, its String contents joined
                           by single spaces. On failure this is an error
                           message instead.
        - ocr_data (list): A list of dictionaries, each with
                           {'text': str, 'x': int, 'y': int, 'w': int, 'h': int}.
                           Empty on failure. Strings whose coordinates cannot
                           be parsed get the placeholder box (0, 0, 10, 10).
    """
    if not xml_file_path or not os.path.exists(xml_file_path):
        return "Error: XML file not provided or does not exist.", []

    full_text_lines = []
    ocr_data = []

    try:
        root = ET.parse(xml_file_path).getroot()

        # ElementTree spells a namespaced tag as "{namespace_uri}tag"; reuse
        # the root's prefix for every lookup below ('' when there is none).
        ns_prefix = root.tag.split('}')[0] + '}' if '}' in root.tag else ''

        for text_line in root.findall(f'.//{ns_prefix}TextLine'):
            line_text_parts = []
            for string_element in text_line.findall(f'{ns_prefix}String'):
                text = string_element.get('CONTENT')
                if not text:  # Missing or empty CONTENT carries no text
                    continue
                line_text_parts.append(text)
                try:
                    # Coordinates may be written as floats; truncate to ints.
                    x, y, w, h = (
                        int(float(string_element.get(attribute)))
                        for attribute in ('HPOS', 'VPOS', 'WIDTH', 'HEIGHT')
                    )
                except (ValueError, TypeError, OverflowError) as e:
                    # TypeError: attribute missing; ValueError: not a number
                    # (or NaN); OverflowError: infinite value.
                    print(f"Warning: Could not parse coordinates for '{text}': {e}")
                    x, y, w, h = 0, 0, 10, 10  # Placeholder
                ocr_data.append({'text': text, 'x': x, 'y': y, 'w': w, 'h': h})
            if line_text_parts:
                full_text_lines.append(" ".join(line_text_parts))

        return "\n".join(full_text_lines), ocr_data

    except ET.ParseError as e:
        return f"Error parsing XML: {e}", []
    except Exception as e:
        # UI boundary: report the problem as text rather than raising.
        return f"An unexpected error occurred during XML parsing: {e}", []


def draw_ocr_on_image(image_pil, ocr_data):
    """
    Draws bounding boxes and text from ocr_data onto the image.

    The image is modified in place; pass a copy to keep the original clean.

    Args:
        image_pil (PIL.Image): The image to draw on.
        ocr_data (list): List of OCR data dictionaries with the keys
                         'text', 'x', 'y', 'w' and 'h'.
    Returns:
        PIL.Image: Image with overlays (the same object that was passed in).
                   Returned untouched when either argument is empty/None.
    """
    if not image_pil or not ocr_data:
        return image_pil  # Return original image if no data or image

    draw = ImageDraw.Draw(image_pil)

    # Font size heuristic: 60% of the mean box height, never below 8.
    # Only positive heights take part in the mean, so degenerate boxes
    # neither inflate the count nor shrink the font.
    positive_heights = [d['h'] for d in ocr_data if d['h'] > 0]
    avg_height = sum(positive_heights) / len(positive_heights) if positive_heights else 10
    font_size = max(8, int(avg_height * 0.6))

    # Try to load a font, fallback to default if not found
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except IOError:
        font = ImageFont.load_default()
        font_size = 10  # Default font is usually small
        print("Arial font not found, using default font.")

    for item in ocr_data:
        x, y, w, h = item['x'], item['y'], item['w'], item['h']
        text = item['text']

        # Draw bounding box
        draw.rectangle([(x, y), (x + w, y + h)], outline="red", width=2)

        # Put the label just above the box; if that would leave the top of
        # the image, put it just inside the box instead.
        label_y = y - font_size - 2
        if label_y <= 0:
            label_y = y + 2
        draw.text((x + 2, label_y), text, fill="green", font=font)

    return image_pil

# --- Gradio Interface Function ---

def process_image_and_xml(image_file, xml_file, show_overlay):
    """
    Main function for the Gradio interface.

    Args:
        image_file: Uploaded image; either an object whose ``name`` attribute
                    is the file path or the path itself. None if not uploaded.
        xml_file: Uploaded ALTO XML file, in the same form. None if not uploaded.
        show_overlay (bool): Whether to render the OCR boxes on a copy of the image.

    Returns:
        tuple: (image, text, overlay_image)
            - image (PIL.Image or None): The uploaded image converted to RGB,
              or None when it is missing or cannot be loaded.
            - text (str): The extracted text, or a message describing what is
              missing or what went wrong.
            - overlay_image (PIL.Image or None): Copy of the image with OCR
              boxes drawn on it; None when no overlay was produced.
    """
    if image_file is None:
        return None, "Please upload an image.", None

    # Load the image before looking at the XML so that a broken image is
    # reported the same way whether or not an XML file was supplied.
    try:
        image_path = getattr(image_file, "name", image_file)
        img_pil = Image.open(image_path).convert("RGB")  # Ensure RGB for drawing
    except Exception as e:
        return None, f"Error loading image: {e}", None

    if xml_file is None:
        return img_pil, "Please upload an OCR XML file.", None

    # Parse XML
    xml_path = getattr(xml_file, "name", xml_file)
    extracted_text, ocr_box_data = parse_alto_xml(xml_path)

    overlay_image_pil = None
    if show_overlay:
        if ocr_box_data:
            # Draw on a copy to keep the original clean for the first output
            overlay_image_pil = draw_ocr_on_image(img_pil.copy(), ocr_box_data)
        elif not extracted_text.startswith(("Error", "An unexpected error")):
            # Parsing succeeded but produced no boxes (e.g. empty XML); both
            # prefixes above are the ones parse_alto_xml uses for failures.
            extracted_text += "\n(No bounding box data found or parsed for overlay)"

    return img_pil, extracted_text, overlay_image_pil


# --- Create Gradio App ---

# Builds the whole UI: upload controls, the three outputs, the event wiring
# and a static example of the expected XML.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# OCR Viewer (ALTO XML)")
    gr.Markdown(
        "Upload an image and its corresponding ALTO OCR XML file. "
        "The app will display the image, extract and show the plain text, "
        "and optionally overlay the OCR predictions on the image."
    )

    # Inputs: the two uploads, the overlay toggle and the trigger button.
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.File(label="Upload Image (PNG, JPG, etc.)", type="file") # Using type="file" for path
            xml_input = gr.File(label="Upload ALTO XML File (.xml)", type="file")
            show_overlay_checkbox = gr.Checkbox(label="Show OCR Overlay on Image", value=False)
            submit_button = gr.Button("Process Files", variant="primary")

    # Outputs: original image and extracted text side by side.
    with gr.Row():
        with gr.Column(scale=1):
            output_image_orig = gr.Image(label="Uploaded Image", type="pil", interactive=False)
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Extracted Plain Text", lines=15, interactive=False)

    output_image_overlay = gr.Image(label="Image with OCR Overlay", type="pil", interactive=False, visible=True) # Always visible, content changes

    def update_interface(image_f, xml_f, show_overlay_val):
        """
        Event callback shared by the button and the checkbox.

        Returns the values for (uploaded image, extracted text, overlay
        image). While a file is still missing, a prompt naming the missing
        file(s) is shown; the image is previewed as soon as it is uploaded.
        """
        if image_f is None and xml_f is None:
            return None, "Please upload both an image and an XML file.", None
        if image_f is None:
            return None, "Please upload an image file.", None
        if xml_f is None:
            # Only the XML is missing: still preview the uploaded image.
            try:
                img_to_show = Image.open(image_f.name).convert("RGB")
            except Exception as e:
                return None, f"Error loading image: {e}", None
            return img_to_show, "Please upload an XML file.", None

        # If "show overlay" is not checked, the overlay entry is None and the
        # gr.Image component shows nothing or a placeholder.
        return process_image_and_xml(image_f, xml_f, show_overlay_val)

    # Both events run the same callback on the same components.
    event_inputs = [image_input, xml_input, show_overlay_checkbox]
    event_outputs = [output_image_orig, output_text, output_image_overlay]

    submit_button.click(
        fn=update_interface,
        inputs=event_inputs,
        outputs=event_outputs
    )

    # Also update if the checkbox changes, provided files are already there
    show_overlay_checkbox.change(
        fn=update_interface,
        inputs=event_inputs,
        outputs=event_outputs
    )

    gr.Markdown("---")
    gr.Markdown("### Example ALTO XML Snippet (for `String` element extraction):")
    gr.Code(
        language="xml",
        value="""
<alto xmlns="http://www.loc.gov/standards/alto/v3/alto.xsd">
  <Description>...</Description>
  <Styles>...</Styles>
  <Layout>
    <Page ID="Page13" PHYSICAL_IMG_NR="13" WIDTH="2394" HEIGHT="3612">
      <PrintSpace>
        <TextLine WIDTH="684" HEIGHT="108" ID="p13_t1" HPOS="465" VPOS="196">
          <String ID="p13_w1" CONTENT="Introduction" HPOS="465" VPOS="196" WIDTH="684" HEIGHT="108" STYLEREFS="font0"/>
        </TextLine>
        <TextLine WIDTH="1798" HEIGHT="51" ID="p13_t2" HPOS="492" VPOS="523">
          <String ID="p13_w2" CONTENT="Britain" HPOS="492" VPOS="523" WIDTH="166" HEIGHT="51" STYLEREFS="font1"/>
          <SP WIDTH="24" VPOS="523" HPOS="658"/>
          <String ID="p13_w3" CONTENT="1981" HPOS="682" VPOS="523" WIDTH="117" HEIGHT="51" STYLEREFS="font1"/>
          <!-- ... more String and SP elements ... -->
        </TextLine>
        <!-- ... more TextLine elements ... -->
      </PrintSpace>
    </Page>
  </Layout>
</alto>
        """,
        interactive=False
    )


if __name__ == "__main__":
    # To test, create a dummy image and use the XML from your example.
    # Save the XML content you provided as "189819724.34.xml" in the same directory.
    # A blank "dummy_image.png" is written to the working directory below.
    try:
        # Dimensions from example XML (<Page WIDTH="2394" HEIGHT="3612">).
        # Draw some text on it first if you want to check box alignment.
        dummy_image = Image.new('RGB', (2394, 3612), color='lightgray')
        dummy_image.save("dummy_image.png")
        print("Created dummy_image.png for testing.")
    except Exception as e:
        # Best effort only: the app is still usable with uploaded files.
        print(f"Error creating dummy image: {e}")

    # Ensure the example XML file (189819724.34.xml) exists in the same directory
    # or provide the correct path if it's elsewhere.
    if not os.path.exists("189819724.34.xml"):
        print("WARNING: Example XML '189819724.34.xml' not found. Please create it or upload your own.")

    demo.launch()