Spaces:

AlirezaF138
/

Persian-OCR

Running

File size: 2,925 Bytes

717b6b1
 
 
ebe3e23
717b6b1
 
 
ebe3e23
717b6b1
 
ebe3e23
 
 
 
 
 
 
 
 
 
717b6b1
ebe3e23
 
 
 
 
 
 
 
717b6b1
 
 
 
 
 
 
 
 
 
 
 
 
 
ebe3e23
 
717b6b1
 
 
 
 
ebe3e23
 
 
 
 
 
 
 
717b6b1
 
 
 
ebe3e23
717b6b1
ebe3e23
 
717b6b1

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os

# Run OCR on a PDF or image and report whether a keyword occurs in the text
def ocr_and_search(input_file, keyword, lang='fas'):  # 'fas': Persian language (Farsi)
    """Extract text with Tesseract and search it for a keyword.

    Args:
        input_file: Either a path to a PDF (a ``str`` ending in ``.pdf``,
            matched case-insensitively) or an already opened
            ``PIL.Image.Image``. Any other value, including ``None``,
            produces empty text rather than an error.
        keyword: Text to look for. Both the keyword and the OCR output are
            lowercased before comparison. A missing or blank keyword is
            never reported as found.
        lang: Tesseract language code forwarded to ``image_to_string``.

    Returns:
        A ``(extracted_text, result_message)`` tuple: the concatenated OCR
        text of every page, and a sentence saying whether the keyword was
        found.
    """
    if input_file is None:
        # Nothing to read; fall through to the "not found" message.
        page_texts = []
    elif isinstance(input_file, str) and input_file.lower().endswith('.pdf'):
        # Rasterise each PDF page, then OCR the pages one by one.
        # The extension test is case-insensitive so "SCAN.PDF" is accepted.
        page_texts = [
            pytesseract.image_to_string(page_image, lang=lang)
            for page_image in convert_from_path(input_file)
        ]
    elif isinstance(input_file, Image.Image):
        page_texts = [pytesseract.image_to_string(input_file, lang=lang)]
    else:
        # Unsupported input type: keep the historical "empty result" behaviour.
        page_texts = []

    extracted_text = "".join(page_texts)

    # An empty needle is a substring of every string, so without this guard a
    # blank keyword would always be reported as "found".
    needle = (keyword or "").lower()
    keyword_found = bool(needle.strip()) and any(
        needle in page_text.lower() for page_text in page_texts
    )

    if not keyword_found:
        result_message = f"Keyword '{keyword}' not found in the document."
    else:
        result_message = f"Keyword '{keyword}' found in the document."

    return extracted_text, result_message

# Create Gradio interface
def gradio_interface():
    """Build the OCR keyword-search UI and launch it.

    The interface takes an input-type selector, an uploaded file and a
    keyword, and shows the OCR text plus the keyword search result.
    ``launch()`` is called here, so this function starts the Gradio app.
    """
    # Define Gradio inputs and outputs
    input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")  # Option to choose file type
    file_input = gr.File(label="Upload PDF/Image")
    keyword_input = gr.Textbox(label="Enter Keyword", value="فلسفه")  # Default keyword is 'فلسفه'
    output_text = gr.Textbox(label="Extracted Text", interactive=False)
    output_message = gr.Textbox(label="Keyword Search Result", interactive=False)

    def process(input_type, file, keyword):
        """Run OCR on the uploaded file and return ``(text, message)``.

        ``input_type`` is the radio selection. It is accepted because the
        interface passes it, but the file's own extension decides how the
        upload is read, so a mismatched selection cannot break the request.
        """
        if file is None:
            # Submitting without an upload would otherwise fail on `.name`.
            return "", "Please upload a PDF or image file."

        # The upload may arrive as an object exposing `.name` or as a plain
        # path string; handle both.
        path = str(getattr(file, "name", file))

        if path.lower().endswith('.pdf'):
            return ocr_and_search(path, keyword)

        # Anything that is not a PDF is opened as an image; the context
        # manager releases the file handle once OCR has finished.
        with Image.open(path) as image:
            return ocr_and_search(image, keyword)

    # Create and launch Gradio interface
    gr.Interface(fn=process,
                 inputs=[input_type, file_input, keyword_input],
                 outputs=[output_text, output_message],
                 title="OCR Keyword Search (PDF/Image)",
                 description="Upload a PDF or Image, enter a keyword, and see the OCR results along with a search for the keyword."
                ).launch()

# Build the interface and launch the Gradio app. There is no `__main__`
# guard, so this runs whenever the module is executed or imported.
gradio_interface()