stupidog04 committed
Commit ddb4ddb · 1 Parent(s): af0cbfe

Add enlarge box ratio to ui, plot for temporal profile

Files changed (1)
  1. app.py  +55 -28
app.py CHANGED
@@ -9,6 +9,7 @@ from pathlib import Path
 import cv2
 import pandas as pd
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+import matplotlib.pyplot as plt
 
 
 #torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/BeautyIsTruthTruthisBeauty.JPG', 'BeautyIsTruthTruthisBeauty.JPG')
@@ -22,6 +23,17 @@ torch.hub.download_url_to_file('https://github.com/JaidedAI/EasyOCR/raw/master/e
 torch.hub.download_url_to_file('https://i.imgur.com/mwQFd7G.jpeg', 'Hindi.jpeg')
 
 
+def plot_temporal_profile(temporal_profile):
+    fig = plt.figure()
+    for i, profile in enumerate(temporal_profile):
+        x, y = zip(*profile)
+        plt.plot(x, y, label=f"Box {i+1}")
+    plt.title("Temporal Profiles")
+    plt.xlabel("Time (s)")
+    plt.ylabel("Value")
+    plt.legend()
+    return fig
+
 def draw_boxes(image, bounds, color='yellow', width=2):
     draw = ImageDraw.Draw(image)
     for bound in bounds:
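
A quick way to sanity-check the new helper outside the app (the data below is hypothetical; in app.py the values come from per-box OCR readings, which may be strings and are then plotted as categories by matplotlib):

# Hypothetical smoke test for plot_temporal_profile; not part of the commit.
profiles = [
    [(0.0, 1.2), (1.0, 1.5), (2.0, 1.7)],  # Box 1: (time in s, value)
    [(0.0, 0.8), (1.0, 0.9), (2.0, 1.1)],  # Box 2
]
fig = plot_temporal_profile(profiles)
fig.savefig("temporal_profiles.png")  # or return the fig to gr.Plot, as inference() does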
@@ -63,7 +75,20 @@ device = 'cuda' if torch.cuda.is_available() else 'cpu'
 processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-printed')
 model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-printed').to(device)
 
-def inference(video, lang, time_step, full_scan, number_filter, use_trocr, period_index):
+def process_box(box, frame, enlarge_ratio):
+    x1, y1 = box[0][0]
+    x2, y2 = box[0][2]
+    enlarge_ratio = enlarge_ratio/2
+    box_width = x2 - x1
+    box_height = y2 - y1
+    x1 = max(0, int(x1 - enlarge_ratio * box_width))
+    x2 = min(frame.shape[1], int(x2 + enlarge_ratio * box_width))
+    y1 = max(0, int(y1 - enlarge_ratio * box_height))
+    y2 = min(frame.shape[0], int(y2 + enlarge_ratio * box_height))
+    cropped_frame = frame[y1:y2, x1:x2]
+    return cropped_frame
+
+def inference(video, lang, full_scan, number_filter, use_trocr, time_step, period_index, box_enlarge_ratio=0.4):
     output = 'results.mp4'
     reader = easyocr.Reader(lang)
     bounds = []
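
As a reading aid, here is how the symmetric enlargement plays out on a hypothetical EasyOCR detection (corners ordered top-left, top-right, bottom-right, bottom-left). The ratio is halved, so 0.4 adds 20% of the box size on each side, clamped to the frame:

import numpy as np

# Hypothetical check; not part of the commit.
frame = np.zeros((100, 200, 3), dtype=np.uint8)  # height 100, width 200
box = ([[50, 40], [150, 40], [150, 60], [50, 60]], "42", 0.99)
crop = process_box(box, frame, enlarge_ratio=0.4)
# The 100x20 box grows by 0.2 * size per side -> x: 30..170, y: 36..64
print(crop.shape)  # (28, 140, 3)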
@@ -73,7 +98,7 @@ def inference(video, lang, time_step, full_scan, number_filter, use_trocr, period_index):
     frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
     output_frames = []
     temporal_profiles = []
-    compress_mp4 = False
+    compress_mp4 = True
 
     # Get the positions of the largest boxes in the first frame
     bounds = reader.readtext(frame)
@@ -91,27 +116,24 @@ def inference(video, lang, time_step, full_scan, number_filter, use_trocr, period_index):
     # Match bboxes to position and store the text read by OCR
     while success:
         if count % (int(frame_rate * time_step)) == 0:
-            if full_scan:
-                bounds = reader.readtext(frame)
-                for box in bounds:
+            bounds = reader.readtext(frame) if full_scan else largest_boxes
+            for i, box in enumerate(bounds):
+                if full_scan:
+                    # Match box to previous box
                     bbox_pos = box_position(box)
                     for i, position in enumerate(positions):
                         distance = np.linalg.norm(np.array(bbox_pos) - np.array(position))
                         if distance < 50:
-                            temporal_profiles[i].append((count / frame_rate, box[1]))
-                            break
-            else:
-                for i, box in enumerate(largest_boxes):
-                    x1, y1 = box[0][0]
-                    x2, y2 = box[0][2]
-                    box_width = x2 - x1
-                    box_height = y2 - y1
-                    ratio = 0.2
-                    x1 = max(0, int(x1 - ratio * box_width))
-                    x2 = min(frame.shape[1], int(x2 + ratio * box_width))
-                    y1 = max(0, int(y1 - ratio * box_height))
-                    y2 = min(frame.shape[0], int(y2 + ratio * box_height))
-                    cropped_frame = frame[y1:y2, x1:x2]
+                            if use_trocr:
+                                cropped_frame = process_box(box, frame, enlarge_ratio=box_enlarge_ratio)
+                                pixel_values = processor(images=cropped_frame, return_tensors="pt").pixel_values
+                                generated_ids = model.generate(pixel_values.to(device))
+                                generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+                                temporal_profiles[i].append((count / frame_rate, generated_text))
+                            else:
+                                temporal_profiles[i].append((count / frame_rate, box[1]))
+                else:
+                    cropped_frame = process_box(box, frame, enlarge_ratio=box_enlarge_ratio)
                     if use_trocr:
                         pixel_values = processor(images=cropped_frame, return_tensors="pt").pixel_values
                         generated_ids = model.generate(pixel_values.to(device))
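
The full-scan branch keeps the 50-pixel nearest-position rule from before; in isolation it behaves like this (hypothetical coordinates):

import numpy as np

# Hypothetical illustration of the distance gate; not part of the commit.
positions = [(100, 40), (300, 40)]   # box centers found in the first frame
bbox_pos = (104, 43)                 # center of a detection in a later frame
matches = [i for i, p in enumerate(positions)
           if np.linalg.norm(np.array(bbox_pos) - np.array(p)) < 50]
print(matches)  # [0] -> the reading is appended to Box 1's temporal profile

Note that the rewritten loop no longer breaks after the first match, so a detection within 50 px of several stored positions now updates each of them.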
@@ -154,10 +176,10 @@ def inference(video, lang, time_step, full_scan, number_filter, use_trocr, period_index):
     # Draw boxes with box indices in the first frame of the output video
     im = Image.fromarray(output_frames[0])
     draw = ImageDraw.Draw(im)
-    font_size = 30
+    font_size = 50
     font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
     for i, box in enumerate(largest_boxes):
-        draw.text((box_position(box)), f"Box {i+1}", fill='red', font=ImageFont.truetype(font_path, font_size))
+        draw.text((box_position(box)), f"{i+1}", fill='red', font=ImageFont.truetype(font_path, font_size))
 
     output_video.release()
     vidcap.release()
@@ -176,7 +198,10 @@ def inference(video, lang, time_step, full_scan, number_filter, use_trocr, period_index):
         df_list.append({"Box": f"Box {i+1}", "Time (s)": t, "Text": text})
         df_list.append({"Box": f"", "Time (s)": "", "Text": ""})
     df = pd.concat([pd.DataFrame(df_list)])
-    return output, im, df
+
+    # generate the plot of temporal profile
+    plot_fig = plot_temporal_profile(temporal_profiles)
+    return output, im, plot_fig, df
 
 
 title = '🖼️Video to Multilingual OCR👁️Gradio'
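
A small aside on the unchanged df construction in the hunk above: wrapping a single frame in pd.concat is a no-op, so this would be equivalent:

# Equivalent construction (a suggestion, not in the commit):
df = pd.DataFrame(df_list)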
@@ -184,7 +209,7 @@ description = 'Multilingual OCR which works conveniently on all devices in multi
 article = "<p style='text-align: center'></p>"
 
 examples = [
-    ['test.mp4',['en'],10,False,True,True,1]
+    ['test.mp4',['en'],False,True,True,10,1,0.4]
 ]
 
 css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
@@ -205,16 +230,18 @@ gr.Interface(
     [
         gr.inputs.Video(label='Input Video'),
         gr.inputs.CheckboxGroup(choices, type="value", default=['en'], label='Language'),
-        gr.inputs.Number(label='Time Step (in seconds)', default=1.0),
         gr.inputs.Checkbox(label='Full Screen Scan'),
-        gr.inputs.Checkbox(label='Use TrOCR large (this is only available when Full Screen Scan is disable)'),
+        gr.inputs.Checkbox(label='Use TrOCR large'),
         gr.inputs.Checkbox(label='Number Filter (remove non-digit char and insert period)'),
-        gr.inputs.Textbox(label="period position",default=1)
+        gr.inputs.Number(label='Time Step (in seconds)', default=1.0),
+        gr.inputs.Number(label="period position",default=1),
+        gr.inputs.Number(label='Box enlarge ratio', default=0.4)
     ],
     [
         gr.outputs.Video(label='Output Video'),
         gr.outputs.Image(label='Output Preview', type='numpy'),
-        gr.outputs.Dataframe(headers=['Box', 'Time (s)', 'Text'], type='pandas'),
+        gr.Plot(label='Temporal Profile'),
+        gr.outputs.Dataframe(headers=['Box', 'Time (s)', 'Text'], type='pandas', max_rows=15)
     ],
     title=title,
     description=description,
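
The new gr.Plot output consumes the matplotlib Figure returned by inference(). A minimal sketch of that wiring, assuming Gradio 3.x (demo_plot is an illustrative stand-in, not from app.py):

import gradio as gr
import matplotlib.pyplot as plt

def demo_plot(n):
    # Hypothetical stand-in for inference(): returns a Figure for gr.Plot.
    fig = plt.figure()
    xs = list(range(int(n)))
    plt.plot(xs, [x * x for x in xs])
    return fig

gr.Interface(demo_plot,
             gr.inputs.Number(label='Points', default=10),
             gr.Plot(label='Temporal Profile')).launch()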
@@ -222,4 +249,4 @@ gr.Interface(
     examples=examples,
     css=css,
     enable_queue=True
-).launch(debug=True)
+).launch(debug=True, share=True)