# DocQA_Agent / app.py
# OmidSakaki's picture
# Update app.py
# 2bf547d verified
# raw
# history blame
# 3.57 kB
import html
import os
import tempfile
import time

import gradio as gr
import numpy as np
import pytesseract
from paddleocr import PaddleOCR
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# Load the OCR backends once at import time; the run_* functions below
# read these module-level objects on every call instead of reloading them.
# NOTE(review): the TrOCR checkpoint is the "printed" base model - confirm
# it is expected to handle Persian script before trusting its column in
# the comparison.
_TROCR_CHECKPOINT = "microsoft/trocr-base-printed"

paddle_ocr = PaddleOCR(
    lang='fa',
    use_textline_orientation=True,
)
trocr_processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
trocr_model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)
def run_paddleocr(image):
    """Extract text from a PIL image with the module-level PaddleOCR engine.

    The image is saved as a JPEG inside a private temporary directory and
    the file path is handed to ``paddle_ocr.ocr``.

    Args:
        image: PIL image in a mode that can be saved as JPEG
            (``compare_models`` passes RGB).

    Returns:
        The recognized text fragments joined with single spaces, or ``''``
        when the engine reports no text.
    """
    # A per-call temporary directory replaces the fixed "temp.jpg" in the
    # working directory: overlapping calls can no longer overwrite each
    # other's input, and the file is deleted when the block exits.
    with tempfile.TemporaryDirectory() as workdir:
        image_path = os.path.join(workdir, "temp.jpg")
        image.save(image_path)
        # NOTE(review): the engine is constructed with
        # use_textline_orientation (PaddleOCR 3.x spelling) while cls=True
        # and the nested result layout read below are the 2.x interface -
        # confirm which PaddleOCR version is pinned for this app.
        result = paddle_ocr.ocr(image_path, cls=True)

    # The per-image entry can be None or empty when nothing is detected;
    # a list such as [None] is truthy, so checking only `result` would let
    # the iteration below fail with TypeError.
    if not result or not result[0]:
        return ''

    # Each entry is read as (box, (text, score)); only the text is kept.
    return ' '.join(line[1][0] for line in result[0])
def run_trocr(image):
    """Recognize text in an image with the module-level TrOCR model.

    Args:
        image: Image accepted by ``trocr_processor`` (``compare_models``
            passes an RGB PIL image).

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    # Preprocess into PyTorch tensors, generate token ids, decode to text.
    encoded = trocr_processor(image, return_tensors="pt")
    token_ids = trocr_model.generate(encoded.pixel_values)
    decoded = trocr_processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]
def run_tesseract(image):
    """Recognize text in an image with Tesseract using the 'fas' language.

    Args:
        image: Image passed straight to ``pytesseract.image_to_string``.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    extracted = pytesseract.image_to_string(image, lang='fas')
    return extracted
def _timed_ocr(engine, image):
    """Run one OCR callable and return ``(text, elapsed_seconds)``."""
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    text = engine(image)
    return text, time.perf_counter() - started


def compare_models(image):
    """Run PaddleOCR, TrOCR and Tesseract on one image and compare them.

    Args:
        image: PIL image or NumPy array; ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple ``(comparison_html, paddle_text, trocr_text,
        tesseract_text)`` where ``comparison_html`` is an HTML table with
        each engine's extracted text and processing time in seconds.

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    # Without this guard a click with no uploaded image would fail with an
    # AttributeError on None; gr.Error gives the user a readable message.
    if image is None:
        raise gr.Error("لطفاً ابتدا یک تصویر بارگذاری کنید.")

    # Normalize the input to an RGB PIL image for all three engines.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    image = image.convert("RGB")

    # Dict order fixes both the execution order and the table row order.
    engines = {
        'PaddleOCR': run_paddleocr,
        'TrOCR': run_trocr,
        'Tesseract': run_tesseract,
    }
    results = {}
    timings = {}
    for name, engine in engines.items():
        results[name], timings[name] = _timed_ocr(engine, image)

    header = (
        "\n<table>\n"
        "<tr>\n"
        "<th>مدل</th>\n"
        "<th>متن استخراج شده</th>\n"
        "<th>زمان پردازش (ثانیه)</th>\n"
        "</tr>\n"
    )
    # The recognized text comes from an arbitrary uploaded image, so it is
    # escaped before being placed in markup; otherwise '<' or '&' in the
    # OCR output would break the table or inject HTML.
    rows = "".join(
        "<tr>\n"
        f"<td>{name}</td>\n"
        f"<td>{html.escape(results[name])}</td>\n"
        f"<td>{timings[name]:.2f}</td>\n"
        "</tr>\n"
        for name in engines
    )
    comparison = header + rows + "</table>\n"

    # The textboxes receive the raw (unescaped) text.
    return comparison, results['PaddleOCR'], results['TrOCR'], results['Tesseract']
# Gradio UI: one image input and a button that feeds compare_models, whose
# four return values fill the HTML table and the three text boxes.
# NOTE(review): the nesting of rows/columns below is reconstructed from a
# listing without indentation - confirm the layout against the deployed app.
_INTRO_MARKDOWN = """
## مقایسه عملکرد مدل‌های OCR برای زبان فارسی
این برنامه سه مدل مختلف OCR را روی تصاویر فارسی مقایسه می‌کند:
1. PaddleOCR
2. TrOCR (مایکروسافت)
3. Tesseract OCR
"""

with gr.Blocks(title="مقایسه مدل‌های OCR فارسی") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="تصویر ورودی", type="pil")
            submit_btn = gr.Button("مقایسه مدل‌ها")
        with gr.Column():
            comparison_output = gr.HTML(label="نتایج مقایسه")
            paddle_output = gr.Textbox(label="PaddleOCR")
            trocr_output = gr.Textbox(label="TrOCR")
            tesseract_output = gr.Textbox(label="Tesseract")

    # Order matches the tuple returned by compare_models.
    result_components = [
        comparison_output,
        paddle_output,
        trocr_output,
        tesseract_output,
    ]
    submit_btn.click(
        fn=compare_models,
        inputs=image_input,
        outputs=result_components,
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()