File size: 3,070 Bytes
18cb325
a559490
 
18cb325
 
 
 
a559490
18cb325
5884368
8bea076
 
a559490
 
a1491f4
18cb325
8bea076
 
 
18cb325
 
 
 
 
 
a559490
 
18cb325
a559490
 
18cb325
 
 
a559490
18cb325
 
 
 
a559490
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5884368
 
18cb325
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import gradio as gr
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_v1.types import RawDocument
from google.cloud import translate_v2 as translate
import zipfile
import os
import io

# Point the Google client libraries at the service-account key file.
# NOTE(review): hard-coded credential filename — confirm the key file is
# deployed next to this script and is excluded from version control.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

# Global DataFrame accumulating one row per processed image:
# filename, OCR-extracted text, and its English translation.
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])

# Google Cloud Document AI processor coordinates (project / region / processor id)
# used to build the processor path for OCR requests.
project_id = "herbaria-ai"
location = "us"
processor_id = "4307b078717a399a"

def translate_text(text, target_language="en"):
    """Translate *text* into *target_language* (default English) using the
    Google Cloud Translation API and return the translated string."""
    client = translate.Client()
    response = client.translate(text, target_language=target_language)
    return response["translatedText"]

def batch_process_documents(file_stream, file_mime_type="image/jpeg") -> tuple:
    """Run one image through the Document AI processor and translate the
    OCR result to English.

    file_stream: a binary file-like object whose full contents are sent
        as the raw document.
    file_mime_type: MIME type reported to Document AI (default JPEG).

    Returns a (extracted_text, translated_text) tuple.
    """
    endpoint = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    docai_client = documentai.DocumentProcessorServiceClient(client_options=endpoint)

    raw_doc = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
    processor_name = docai_client.processor_path(project_id, location, processor_id)
    response = docai_client.process_document(
        request=documentai.ProcessRequest(name=processor_name, raw_document=raw_doc)
    )

    ocr_text = response.document.text
    return ocr_text, translate_text(ocr_text)

def find_images(directory, extensions=('.jpeg', '.jpg')):
    """Recursively yield full paths of image files under *directory*.

    A file qualifies when its name ends (case-insensitively) with one of
    *extensions* and is not hidden (no leading dot).
    """
    for dirpath, _, names in os.walk(directory):
        wanted = (n for n in names
                  if n.lower().endswith(extensions) and not n.startswith('.'))
        for name in wanted:
            yield os.path.join(dirpath, name)

def process_zip_file(file_info):
    """Extract a ZIP of images, OCR + translate each one, and return the
    accumulated results.

    file_info: the upload payload; expected to be a mapping whose
        "content" key holds the raw ZIP bytes.
        NOTE(review): recent gradio versions pass gr.File callbacks a
        temp-file object/path rather than a dict — confirm against the
        installed gradio version.

    Returns the global results_df on success, or an error string
    describing the first image that failed (processing stops there).
    """
    global results_df
    results_df = results_df.iloc[0:0]  # clear any rows from a previous run

    with zipfile.ZipFile(io.BytesIO(file_info["content"]), 'r') as zip_ref:
        zip_ref.extractall("extracted_files")

    for file_path in find_images("extracted_files"):
        try:
            # Context manager fixes a leak in the original, which opened
            # each image without ever closing the handle.
            with open(file_path, "rb") as image_file:
                extracted_text, translated_text = batch_process_documents(image_file)
            new_row = pd.DataFrame([{
                "Filename": os.path.basename(file_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            }])
            results_df = pd.concat([results_df, new_row], ignore_index=True)
        except Exception as e:
            # Preserve the original contract: report the first failure as
            # a string instead of raising into the UI layer.
            return f"An error occurred while processing {file_path}: {e}"

    return results_df

# Wire the processing pipeline into a minimal Gradio UI:
# one ZIP-file upload in, one results table out.
interface = gr.Interface(fn=process_zip_file,
                         inputs=gr.File(label="Upload ZIP File"),
                         outputs=gr.Dataframe(label="Processed Results"),
                         title="Document Processing and Translation")

# Start the web server (blocks until the app is stopped).
interface.launch()



# def greet(name):
   # return "Hello " + name + "!!"

#iface = gr.Interface(fn=greet, inputs="text", outputs="text")
#iface.launch()