import io
import os
import tempfile
import zipfile

import gradio as gr
import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud import translate_v2 as translate
from google.cloud.documentai_v1.types import RawDocument


# Point the Google Cloud client libraries at the credentials JSON file.
# The path is relative, so it is resolved against the current working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

# Module-level table holding the results of the most recent upload;
# process_zip_file() rebinds it on every call.
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])

# Identifiers used to build the Document AI endpoint and processor path
# in batch_process_documents().
project_id = "herbaria-ai"
location = "us"
processor_id = "4307b078717a399a"


def translate_text(text, target_language="en"):
    """Translate ``text`` into ``target_language`` using the Translation v2 client.

    Args:
        text: Plain text to translate.
        target_language: Language code handed to the API; defaults to ``"en"``.

    Returns:
        The ``"translatedText"`` field of the API response. ``None``, empty
        or whitespace-only input is returned without contacting the service
        (``None`` is returned as ``""``).
    """
    # Nothing to translate: skip client creation and the network round trip.
    if text is None or (isinstance(text, str) and not text.strip()):
        return text or ""

    translate_client = translate.Client()
    # format_="text" declares the input as plain text rather than HTML, so the
    # translated string is not returned HTML-entity-escaped.
    result = translate_client.translate(
        text,
        target_language=target_language,
        format_="text",
    )
    return result["translatedText"]


def batch_process_documents(file_stream, file_mime_type="image/jpeg") -> tuple:
    """Run one document through Document AI and translate the text it returns.

    Despite the name, each call handles a single document: the stream is read
    in full and sent inline in one ``process_document`` request.

    Args:
        file_stream: Readable binary stream holding the document. It is read
            to the end but not closed here; the caller owns it.
        file_mime_type: MIME type reported to Document AI for the content.

    Returns:
        A ``(extracted_text, translated_text)`` tuple. When the processor
        returns no text (or only whitespace), the second element is the
        extracted text itself and no translation request is made.
    """
    # The endpoint host is derived from the module-level processor location.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
    name = client.processor_path(project_id, location, processor_id)
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
    result = client.process_document(request=request)

    extracted_text = result.document.text
    if extracted_text and extracted_text.strip():
        translated_text = translate_text(extracted_text)
    else:
        # No recognised text: avoid a pointless translation request.
        translated_text = extracted_text
    return extracted_text, translated_text


def find_images(directory, extensions=('.jpeg', '.jpg')):
    """Yield paths of matching files found anywhere under ``directory``.

    Args:
        directory: Root directory to walk recursively.
        extensions: File-name suffix or iterable of suffixes to accept.
            Matching is case-insensitive on both the file name and the
            suffixes; a single string is treated as one suffix.

    Yields:
        ``os.path.join(root, filename)`` for every file whose name ends with
        one of the suffixes and does not start with ``'.'``. Directories and
        files are visited in sorted order so the output is deterministic.
    """
    # str.endswith needs a tuple; normalise so lists, sets or a bare string work.
    if isinstance(extensions, str):
        suffixes = (extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in extensions)

    for root, dirnames, filenames in os.walk(directory):
        # Sorting dirnames in place fixes the order os.walk descends in.
        dirnames.sort()
        for filename in sorted(filenames):
            if filename.startswith('.'):
                continue  # skip dot-files
            if filename.lower().endswith(suffixes):
                yield os.path.join(root, filename)


def _zip_source(file_info):
    """Convert an uploaded-file value into something ``zipfile.ZipFile`` can open."""
    if isinstance(file_info, dict):
        # Original calling convention: raw archive bytes under "content".
        return io.BytesIO(file_info["content"])
    if isinstance(file_info, (bytes, bytearray)):
        return io.BytesIO(file_info)
    if isinstance(file_info, (str, os.PathLike)):
        return file_info
    # Temp-file style wrappers expose the on-disk path as ``.name``.
    path = getattr(file_info, "name", None)
    if isinstance(path, str) and path:
        return path
    raise TypeError(f"Unsupported upload value: {type(file_info).__name__}")


def process_zip_file(file_info):
    """Extract a ZIP upload, then OCR and translate every image inside it.

    Args:
        file_info: The uploaded archive. Accepted forms are a dict with the
            archive bytes under ``"content"`` (the original convention), raw
            ``bytes``, a filesystem path, or an object whose ``.name`` is a
            path. NOTE(review): which form arrives depends on the Gradio
            version and the ``File`` component's ``type`` setting - confirm.

    Returns:
        A DataFrame with the columns ``Filename``, ``Extracted Text`` and
        ``Translated Text``, one row per image. If an image fails, its row
        carries the error message in ``Extracted Text`` and processing
        continues, so the return value is always a DataFrame.

    Raises:
        TypeError: If ``file_info`` is none of the accepted forms.
        zipfile.BadZipFile: If the upload is not a valid ZIP archive.

    Side effects:
        Rebinds the module-level ``results_df`` to the returned DataFrame.
    """
    global results_df

    rows = []
    # A fresh temporary directory per call keeps images from earlier uploads
    # out of this run and is removed once processing finishes.
    with tempfile.TemporaryDirectory(prefix="extracted_files_") as extract_dir:
        with zipfile.ZipFile(_zip_source(file_info), 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

        for file_path in find_images(extract_dir):
            filename = os.path.basename(file_path)
            try:
                # Context manager closes the image even if processing fails.
                with open(file_path, "rb") as image_stream:
                    extracted_text, translated_text = batch_process_documents(image_stream)
            except Exception as e:
                # UI boundary: report the failure for this image, keep the rest.
                rows.append({
                    "Filename": filename,
                    "Extracted Text": f"An error occurred while processing {file_path}: {e}",
                    "Translated Text": "",
                })
                continue
            rows.append({
                "Filename": filename,
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            })

    # Build the frame once instead of concatenating inside the loop.
    results_df = pd.DataFrame(
        rows,
        columns=["Filename", "Extracted Text", "Translated Text"],
    )
    return results_df


# Gradio UI: a single ZIP upload is passed to process_zip_file() and the
# value it returns is rendered in a table.
interface = gr.Interface(
    fn=process_zip_file,
    inputs=gr.File(label="Upload ZIP File"),
    outputs=gr.Dataframe(label="Processed Results"),
    title="Document Processing and Translation",
)

# Launched at module level, so the app starts whenever this file is executed
# or imported.
interface.launch()