Update app.py
Browse files
app.py
CHANGED
@@ -1,58 +1,75 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import os
|
3 |
import pandas as pd
|
|
|
|
|
4 |
from google.cloud import documentai_v1 as documentai
|
5 |
from google.cloud.documentai_v1.types import RawDocument
|
6 |
from google.cloud import translate_v2 as translate
|
7 |
-
from google.api_core.client_options import ClientOptions
|
8 |
import zipfile
|
|
|
9 |
import io
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"
|
14 |
|
15 |
# Set your Google Cloud Document AI processor details here
|
16 |
-
project_id = "
|
17 |
-
location = "
|
18 |
-
processor_id = "
|
19 |
|
20 |
def translate_text(text, target_language="en"):
    """Translate ``text`` with the Google Cloud Translation (v2) client.

    Args:
        text: Plain text to translate.
        target_language: Language code to translate into; defaults to "en".

    Returns:
        The translated string, or "" when ``text`` is empty.
    """
    # Nothing to translate: skip the network round trip entirely.
    if not text:
        return ""

    translate_client = translate.Client()
    # The input is plain text, not HTML. Asking for the "text" format avoids
    # getting HTML entities (e.g. &#39;) back in the translation.
    result = translate_client.translate(
        text, target_language=target_language, format_="text"
    )
    return result["translatedText"]
|
24 |
|
25 |
-
def
|
26 |
-
|
27 |
-
# Process the document directly from the file-like object
|
28 |
-
extracted_text, translated_text = batch_process_documents(file, "image/jpeg")
|
29 |
-
return extracted_text, translated_text
|
30 |
-
except Exception as e:
|
31 |
-
return f"An error occurred: {str(e)}", ""
|
32 |
-
|
33 |
-
def batch_process_documents(file, file_mime_type: str) -> tuple:
|
34 |
-
opts = documentai.ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
|
35 |
client = documentai.DocumentProcessorServiceClient(client_options=opts)
|
36 |
-
|
37 |
-
|
38 |
-
raw_document = RawDocument(content=file.read(), mime_type=file_mime_type)
|
39 |
name = client.processor_path(project_id, location, processor_id)
|
40 |
request = documentai.ProcessRequest(name=name, raw_document=raw_document)
|
41 |
result = client.process_document(request=request)
|
|
|
42 |
extracted_text = result.document.text
|
43 |
translated_text = translate_text(extracted_text)
|
44 |
return extracted_text, translated_text
|
45 |
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
iface.launch()
|
56 |
|
57 |
|
58 |
# def greet(name):
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
+
import gradio as gr
|
3 |
+
from google.api_core.client_options import ClientOptions
|
4 |
from google.cloud import documentai_v1 as documentai
|
5 |
from google.cloud.documentai_v1.types import RawDocument
|
6 |
from google.cloud import translate_v2 as translate
|
|
|
7 |
import zipfile
|
8 |
+
import os
|
9 |
import io
|
10 |
|
11 |
+
# Shared results table, one row per processed image. process_zip_file()
# resets and refills this global each time it runs.
results_df = pd.DataFrame(
    columns=["Filename", "Extracted Text", "Translated Text"],
)

# Google Cloud Document AI processor details. The values below are
# placeholders: replace them with your own project, location and processor.
# `location` is also used to build the Document AI API endpoint host name.
project_id = "your-gcp-project-id"
location = "your-gcp-location"
processor_id = "your-processor-id"
|
18 |
|
19 |
def translate_text(text, target_language="en"):
    """Translate ``text`` with the Google Cloud Translation (v2) client.

    Args:
        text: Plain text to translate (in this app, the text extracted by
            Document AI).
        target_language: Language code to translate into; defaults to "en".

    Returns:
        The translated string, or "" when ``text`` is empty.
    """
    # Nothing to translate (e.g. no text was extracted from the image):
    # skip the network round trip entirely.
    if not text:
        return ""

    translate_client = translate.Client()
    # The input is plain text, not HTML. Asking for the "text" format avoids
    # getting HTML entities (e.g. &#39;) back in the translation.
    result = translate_client.translate(
        text, target_language=target_language, format_="text"
    )
    return result["translatedText"]
|
23 |
|
24 |
+
def batch_process_documents(file_stream, file_mime_type="image/jpeg") -> tuple:
    """Send one document to Document AI and translate the text it returns.

    Args:
        file_stream: Binary file-like object; its whole content is read and
            uploaded as the raw document.
        file_mime_type: MIME type declared for the uploaded content.

    Returns:
        A ``(extracted_text, translated_text)`` tuple.
    """
    # The service endpoint is regional, so it is derived from `location`.
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    payload = file_stream.read()
    request = documentai.ProcessRequest(
        name=client.processor_path(project_id, location, processor_id),
        raw_document=RawDocument(content=payload, mime_type=file_mime_type),
    )
    response = client.process_document(request=request)

    extracted_text = response.document.text
    return extracted_text, translate_text(extracted_text)
|
36 |
|
37 |
+
def find_images(directory, extensions=('.jpeg', '.jpg')):
    """Yield the paths of image files found anywhere under ``directory``.

    Hidden files (names starting with ".") are skipped. Extension matching is
    case-insensitive on both sides, and results come out in a stable order
    (directories and file names sorted) regardless of filesystem order.

    Args:
        directory: Root directory to walk recursively.
        extensions: A single suffix string or any iterable of suffix strings
            (e.g. ".jpg") that mark a file as an image.

    Yields:
        ``os.path.join(root, filename)`` for every matching file.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    # str.endswith only accepts a tuple; lower-casing here lets callers pass
    # ".JPG" and still match, since file names are lower-cased below.
    suffixes = tuple(ext.lower() for ext in extensions)

    for root, dirnames, filenames in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirnames.sort()
        for filename in sorted(filenames):
            if filename.startswith('.'):
                continue
            if filename.lower().endswith(suffixes):
                yield os.path.join(root, filename)
|
42 |
+
|
43 |
+
def _open_uploaded_zip(file_info):
    """Open the uploaded archive, whatever form the upload arrives in."""
    if file_info is None:
        raise TypeError("no ZIP file was provided")
    if isinstance(file_info, dict):
        # Original calling convention: a mapping holding the raw bytes.
        return zipfile.ZipFile(io.BytesIO(file_info["content"]), 'r')
    if isinstance(file_info, (bytes, bytearray)):
        return zipfile.ZipFile(io.BytesIO(file_info), 'r')
    if isinstance(file_info, (str, os.PathLike)):
        return zipfile.ZipFile(file_info, 'r')
    # Temp-file style wrappers expose the on-disk path as `.name`; any other
    # object is handed to ZipFile as a file-like object.
    # NOTE(review): which of these forms gr.File passes depends on the Gradio
    # version and its `type` setting - confirm against the installed version.
    return zipfile.ZipFile(getattr(file_info, "name", file_info), 'r')


def process_zip_file(file_info):
    """Extract an uploaded ZIP, OCR and translate every image, return a table.

    Args:
        file_info: The uploaded archive. Accepted forms: a dict with the raw
            bytes under "content", raw bytes, a filesystem path, an object
            with a ``.name`` path attribute, or a binary file-like object.

    Returns:
        A DataFrame with the columns "Filename", "Extracted Text" and
        "Translated Text", one row per image found in the archive. The same
        DataFrame is stored in the module-level ``results_df``.

        If an image fails to process, its row carries the error message in
        "Extracted Text" and an empty "Translated Text"; the rest of the
        batch is still processed, and the result is always a DataFrame.

    Raises:
        TypeError: if ``file_info`` is None.
        zipfile.BadZipFile: if the upload is not a valid ZIP archive.
    """
    global results_df
    # Local import so this function does not rely on a file-level import.
    import tempfile

    columns = ["Filename", "Extracted Text", "Translated Text"]
    rows = []

    # A fresh temporary directory per call keeps images from earlier uploads
    # out of this run and is removed automatically afterwards.
    with tempfile.TemporaryDirectory(prefix="extracted_files_") as workdir:
        with _open_uploaded_zip(file_info) as zip_ref:
            zip_ref.extractall(workdir)

        for file_path in find_images(workdir):
            try:
                # The context manager closes the image even if the call fails.
                with open(file_path, "rb") as image_stream:
                    extracted_text, translated_text = batch_process_documents(image_stream)
            except Exception as e:
                # Record the failure in the row instead of aborting the batch.
                shown_path = os.path.relpath(file_path, workdir)
                extracted_text = f"An error occurred while processing {shown_path}: {e}"
                translated_text = ""
            rows.append({
                "Filename": os.path.basename(file_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            })

    # Build the table once rather than concatenating one row at a time.
    results_df = pd.DataFrame(rows, columns=columns)
    return results_df
|
65 |
+
|
66 |
+
# Gradio UI: a ZIP upload goes in, the table returned by process_zip_file
# comes out.
zip_upload = gr.File(label="Upload ZIP File")
results_table = gr.Dataframe(label="Processed Results")

interface = gr.Interface(
    fn=process_zip_file,
    inputs=zip_upload,
    outputs=results_table,
    title="Document Processing and Translation",
)

# Start the web app as soon as this file is executed.
interface.launch()
|
72 |
|
|
|
73 |
|
74 |
|
75 |
# def greet(name):
|