Spaces:

spark-ds549
/

Chinese-Label-Transcription

Sleeping

App Files Files Community

mkaramb committed on Apr 18, 2024

Commit

a559490

verified ·

1 Parent(s): abc2103

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -31

app.py CHANGED Viewed

@@ -1,58 +1,75 @@
-import gradio as gr
-import os
 import pandas as pd
 from google.cloud import documentai_v1 as documentai
 from google.cloud.documentai_v1.types import RawDocument
 from google.cloud import translate_v2 as translate
-from google.api_core.client_options import ClientOptions
 import zipfile
 import io
-import os
# Point the Google client libraries at the service-account key file of the
# default compute service account. setdefault() is used so that a credentials
# path already exported by the hosting environment is not overwritten.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "herbaria-ai-3c860bcb0f44.json")
# Set your Google Cloud Document AI processor details here
project_id = "herbaria-ai"
location = "us"
processor_id = "4307b078717a399a"
 def translate_text(text, target_language="en"):
     """Translate *text* into *target_language* with Cloud Translation (v2).

     Args:
         text: Plain text to translate.
         target_language: Language code of the desired output (default "en").

     Returns:
         The translated string, or "" when *text* is empty (no API call is
         made in that case).
     """
     if not text:
         # Nothing to translate; skip the network round-trip.
         return ""
     translate_client = translate.Client()
     # format_="text" asks the service to treat the input as plain text.
     # The API's default is HTML mode, which returns HTML-escaped entities
     # (e.g. "&#39;") for ordinary OCR output.
     result = translate_client.translate(
         text, target_language=target_language, format_="text"
     )
     return result["translatedText"]
def process_image(file):
    """Run OCR and translation on a single uploaded JPEG.

    Args:
        file: File-like object holding the image; it is handed straight to
            batch_process_documents with the "image/jpeg" MIME type.

    Returns:
        A 2-tuple (extracted_text, translated_text). If anything raises,
        the tuple is (error message, "") instead.
    """
    try:
        # Hand the file-like object over without touching the disk.
        texts = batch_process_documents(file, "image/jpeg")
        extracted, translated = texts
    except Exception as exc:
        return f"An error occurred: {str(exc)}", ""
    else:
        return extracted, translated
def batch_process_documents(file, file_mime_type: str) -> tuple:
    """Send one document to Document AI and translate the recognized text.

    Args:
        file: File-like object; its full content is read and uploaded.
        file_mime_type: MIME type of the content, e.g. "image/jpeg".

    Returns:
        A 2-tuple (extracted_text, translated_text).
    """
    # ClientOptions is the class imported from google.api_core.client_options
    # at the top of the file; it is not a documented member of the
    # documentai_v1 package, so it must not be looked up there.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    # Read the file content directly from the file-like object
    raw_document = RawDocument(content=file.read(), mime_type=file_mime_type)
    name = client.processor_path(project_id, location, processor_id)
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
    result = client.process_document(request=request)
    extracted_text = result.document.text
    translated_text = translate_text(extracted_text)
    return extracted_text, translated_text
# Build and start the upload UI: one file input, two text outputs.
# Components come from the top-level gradio namespace because the
# gr.inputs / gr.outputs aliases are deprecated.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.File(label="Upload Image File"),
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Translated Text"),
    ],
)
iface.launch()
 # def greet(name):

 import pandas as pd
+import gradio as gr
+from google.api_core.client_options import ClientOptions
 from google.cloud import documentai_v1 as documentai
 from google.cloud.documentai_v1.types import RawDocument
 from google.cloud import translate_v2 as translate
 import zipfile
+import os
 import io
# Global DataFrame declaration
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])
# Set your Google Cloud Document AI processor details here, or provide them
# through the GCP_PROJECT_ID / GCP_LOCATION / GCP_PROCESSOR_ID environment
# variables. The placeholder strings remain the defaults, so editing them in
# place keeps working.
project_id = os.environ.get("GCP_PROJECT_ID", "your-gcp-project-id")
location = os.environ.get("GCP_LOCATION", "your-gcp-location")
processor_id = os.environ.get("GCP_PROCESSOR_ID", "your-processor-id")
 def translate_text(text, target_language="en"):
     """Translate *text* into *target_language* with Cloud Translation (v2).

     Args:
         text: Plain text to translate.
         target_language: Language code of the desired output (default "en").

     Returns:
         The translated string, or "" when *text* is empty (no API call is
         made in that case).
     """
     if not text:
         # Nothing to translate; skip the network round-trip.
         return ""
     translate_client = translate.Client()
     # format_="text" asks the service to treat the input as plain text.
     # The API's default is HTML mode, which returns HTML-escaped entities
     # (e.g. "&#39;") for ordinary OCR output.
     result = translate_client.translate(
         text, target_language=target_language, format_="text"
     )
     return result["translatedText"]
def batch_process_documents(file_stream, file_mime_type="image/jpeg") -> tuple:
    """Run one document through Document AI, then translate the OCR text.

    Args:
        file_stream: Readable binary stream; it is consumed with read().
        file_mime_type: MIME type sent with the content (default "image/jpeg").

    Returns:
        A 2-tuple (extracted_text, translated_text).
    """
    # The API endpoint is derived from the module-level location setting.
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )
    payload = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
    response = client.process_document(
        request=documentai.ProcessRequest(
            name=client.processor_path(project_id, location, processor_id),
            raw_document=payload,
        )
    )
    ocr_text = response.document.text
    return ocr_text, translate_text(ocr_text)
def find_images(directory, extensions=('.jpeg', '.jpg')):
    """Yield the paths of image files found anywhere under *directory*.

    Args:
        directory: Root directory to walk recursively.
        extensions: One suffix or an iterable of suffixes to accept.
            Matching is case-insensitive on both the filename and the
            suffixes.

    Yields:
        os.path.join(root, filename) for every matching file whose name does
        not start with a dot, in a deterministic (sorted) traversal order.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Filenames are lower-cased before comparison, so the suffixes must be
    # lower-cased too or a caller-supplied ".JPG" would never match.
    suffixes = tuple(ext.lower() for ext in extensions)
    for root, dirnames, filenames in os.walk(directory):
        # Sorting in place makes os.walk descend in a stable order.
        dirnames.sort()
        for filename in sorted(filenames):
            if filename.startswith('.'):
                # Skip hidden files such as macOS "._*" resource forks.
                continue
            if filename.lower().endswith(suffixes):
                yield os.path.join(root, filename)
def process_zip_file(file_info):
    """Extract a ZIP of JPEG images, OCR and translate each, and tabulate.

    Args:
        file_info: The uploaded archive. Accepted forms are a dict with a
            "content" bytes entry, raw bytes, a filesystem path, or an
            object exposing the path as ``.name`` (the forms a Gradio File
            component can deliver, depending on version and ``type``).

    Returns:
        The module-level ``results_df`` DataFrame with one row per image
        (columns "Filename", "Extracted Text", "Translated Text"). If an
        image fails, an error message string is returned instead and
        ``results_df`` holds the rows completed before the failure.
    """
    import tempfile  # local import: only this function needs it

    global results_df
    columns = ["Filename", "Extracted Text", "Translated Text"]
    results_df = pd.DataFrame(columns=columns)  # start every run empty

    if file_info is None:
        # No upload supplied; nothing to process.
        return results_df

    # Normalize the upload into something zipfile.ZipFile can open.
    if isinstance(file_info, dict):
        zip_source = io.BytesIO(file_info["content"])
    elif isinstance(file_info, (bytes, bytearray)):
        zip_source = io.BytesIO(file_info)
    elif isinstance(file_info, (str, os.PathLike)):
        zip_source = file_info
    else:
        zip_source = file_info.name  # temp-file wrapper carrying the path

    rows = []
    # A fresh temporary directory per call prevents images left over from an
    # earlier upload from being processed again, and is removed afterwards.
    with tempfile.TemporaryDirectory() as extract_dir:
        with zipfile.ZipFile(zip_source, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        for file_path in find_images(extract_dir):
            try:
                # The context manager closes the image handle even on error.
                with open(file_path, "rb") as image_file:
                    extracted_text, translated_text = batch_process_documents(image_file)
            except Exception as e:
                results_df = pd.DataFrame(rows, columns=columns)
                return f"An error occurred while processing {file_path}: {e}"
            rows.append({
                "Filename": os.path.basename(file_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            })
    # Build the frame once instead of concatenating inside the loop.
    results_df = pd.DataFrame(rows, columns=columns)
    return results_df
# Gradio front end: a ZIP upload goes in, the results table comes out.
interface = gr.Interface(
    title="Document Processing and Translation",
    fn=process_zip_file,
    inputs=gr.File(label="Upload ZIP File"),
    outputs=gr.Dataframe(label="Processed Results"),
)
# Start serving the app.
interface.launch()
 # def greet(name):