File size: 3,398 Bytes
4d57e5c 18cb325 7b79b85 4d57e5c eb822d4 4d57e5c 7b79b85 4d57e5c 37bfbd1 7b79b85 4d57e5c 7b79b85 c8197d8 4d57e5c 7b79b85 4d57e5c eb822d4 5884368 a22bf0a 18cb325 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_v1.types import RawDocument
from google.cloud import translate_v2 as translate
import zipfile
import os
import io
import gradio as gr
# Global DataFrame declaration
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])
# Set your Google Cloud Document AI processor details here
project_id = "herbaria-ai"
location = "us"
processor_id = "4307b078717a399a"
def translate_text(text, target_language="en"):
translate_client = translate.Client()
result = translate_client.translate(text, target_language=target_language)
return result["translatedText"]
def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)
with open(file_path, "rb") as file_stream:
raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
name = client.processor_path(project_id, location, processor_id)
request = documentai.ProcessRequest(name=name, raw_document=raw_document)
result = client.process_document(request=request)
extracted_text = result.document.text
translated_text = translate_text(extracted_text)
return extracted_text, translated_text
def unzip_and_find_jpgs(file_path):
extract_path = "extracted_files"
os.makedirs(extract_path, exist_ok=True)
jpg_files = []
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(extract_path)
for root, dirs, files in os.walk(extract_path):
if '__MACOSX' in root:
continue
for file in files:
if file.lower().endswith('.jpg'):
full_path = os.path.join(root, file)
jpg_files.append(full_path)
return jpg_files
def process_images(uploaded_file):
global results_df
results_df = results_df.iloc[0:0] # Clear the DataFrame if re-running this cell
file_path = uploaded_file.name # Gradio provides the file path through the .name attribute
try:
image_files = unzip_and_find_jpgs(file_path)
if not image_files:
return "No JPG files found in the zip."
for file_path in image_files:
extracted_text, translated_text = batch_process_documents(file_path, "image/jpeg")
new_row = pd.DataFrame([{
"Filename": os.path.basename(file_path),
"Extracted Text": extracted_text,
"Translated Text": translated_text
}])
results_df = pd.concat([results_df, new_row], ignore_index=True)
except Exception as e:
return f"An error occurred: {str(e)}"
return results_df.to_html()
interface = gr.Interface(
fn=process_images,
inputs="file",
outputs="html",
title="Document AI Translation",
description="Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image."
)
if __name__ == "__main__":
interface.launch(debug=True)
# def greet(name):
# return "Hello " + name + "!!"
#iface = gr.Interface(fn=greet, inputs="text", outputs="text")
#iface.launch()
|