mkaramb committed on
Commit
a559490
·
verified ·
1 Parent(s): abc2103

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -31
app.py CHANGED
@@ -1,58 +1,75 @@
1
- import gradio as gr
2
- import os
3
  import pandas as pd
 
 
4
  from google.cloud import documentai_v1 as documentai
5
  from google.cloud.documentai_v1.types import RawDocument
6
  from google.cloud import translate_v2 as translate
7
- from google.api_core.client_options import ClientOptions
8
  import zipfile
 
9
  import io
10
 
11
- import os
12
- # Upload credential json file from default compute service account
13
- os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"
14
 
15
  # Set your Google Cloud Document AI processor details here
16
- project_id = "herbaria-ai"
17
- location = "us"
18
- processor_id = "4307b078717a399a"
19
 
20
  def translate_text(text, target_language="en"):
21
  translate_client = translate.Client()
22
  result = translate_client.translate(text, target_language=target_language)
23
  return result["translatedText"]
24
 
25
- def process_image(file):
26
- try:
27
- # Process the document directly from the file-like object
28
- extracted_text, translated_text = batch_process_documents(file, "image/jpeg")
29
- return extracted_text, translated_text
30
- except Exception as e:
31
- return f"An error occurred: {str(e)}", ""
32
-
33
- def batch_process_documents(file, file_mime_type: str) -> tuple:
34
- opts = documentai.ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
35
  client = documentai.DocumentProcessorServiceClient(client_options=opts)
36
-
37
- # Read the file content directly from the file-like object
38
- raw_document = RawDocument(content=file.read(), mime_type=file_mime_type)
39
  name = client.processor_path(project_id, location, processor_id)
40
  request = documentai.ProcessRequest(name=name, raw_document=raw_document)
41
  result = client.process_document(request=request)
 
42
  extracted_text = result.document.text
43
  translated_text = translate_text(extracted_text)
44
  return extracted_text, translated_text
45
 
46
- iface = gr.Interface(
47
- fn=process_image,
48
- inputs=gr.inputs.File(label="Upload Image File"),
49
- outputs=[
50
- gr.outputs.Textbox(label="Extracted Text"),
51
- gr.outputs.Textbox(label="Translated Text")
52
- ]
53
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- iface.launch()
56
 
57
 
58
  # def greet(name):
 
 
 
1
  import pandas as pd
2
+ import gradio as gr
3
+ from google.api_core.client_options import ClientOptions
4
  from google.cloud import documentai_v1 as documentai
5
  from google.cloud.documentai_v1.types import RawDocument
6
  from google.cloud import translate_v2 as translate
 
7
  import zipfile
8
+ import os
9
  import io
10
 
11
# Module-level accumulator: one row per successfully processed image.
# Cleared at the start of each run of process_zip_file.
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])
 
13
 
14
# Google Cloud Document AI processor configuration.
# NOTE(review): these are placeholder strings — they must be replaced with
# real project settings (and credentials configured) before deployment.
project_id = "your-gcp-project-id"
location = "your-gcp-location"
processor_id = "your-processor-id"
18
 
19
def translate_text(text, target_language="en"):
    """Translate *text* into *target_language* via Google Cloud Translate.

    Returns the translated string from the API response. Requires valid
    Google Cloud credentials at call time.
    """
    client = translate.Client()
    response = client.translate(text, target_language=target_language)
    return response["translatedText"]
23
 
24
def batch_process_documents(file_stream, file_mime_type="image/jpeg") -> tuple:
    """OCR one document through Document AI, then translate the text.

    Parameters:
        file_stream: a binary file-like object; its entire contents are read.
        file_mime_type: MIME type passed to Document AI (default JPEG).

    Returns:
        (extracted_text, translated_text) tuple of strings.

    Uses the module-level project_id / location / processor_id settings and
    requires valid Google Cloud credentials.
    """
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    payload = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
    processor_name = client.processor_path(project_id, location, processor_id)
    result = client.process_document(
        request=documentai.ProcessRequest(name=processor_name, raw_document=payload)
    )

    extracted_text = result.document.text
    return extracted_text, translate_text(extracted_text)
36
 
37
def find_images(directory, extensions=('.jpeg', '.jpg')):
    """Yield paths of image files found recursively under *directory*.

    A file matches when its name (case-insensitively) ends with one of
    *extensions* and is not a hidden/dot file.
    """
    for dirpath, _, names in os.walk(directory):
        yield from (
            os.path.join(dirpath, name)
            for name in names
            if name.lower().endswith(extensions) and not name.startswith('.')
        )
42
+
43
def process_zip_file(file_info):
    """Extract an uploaded ZIP of images, then OCR + translate each image.

    Parameters:
        file_info: upload payload; the raw ZIP bytes are read from its
            "content" key.  # NOTE(review): confirm this matches what
            # gr.File delivers in the installed Gradio version.

    Returns:
        The accumulated results DataFrame, or an error-message string if
        any single file fails (processing stops at the first failure).
    """
    global results_df
    # Reset the accumulator so repeated runs start from an empty table.
    results_df = results_df.iloc[0:0]

    # NOTE(review): extractall on an untrusted ZIP is vulnerable to path
    # traversal ("zip slip"); consider validating member names.
    with zipfile.ZipFile(io.BytesIO(file_info["content"]), 'r') as zip_ref:
        zip_ref.extractall("extracted_files")

    for file_path in find_images("extracted_files"):
        try:
            # Fix: open via a context manager so the handle is always
            # closed — the original leaked one descriptor per image.
            with open(file_path, "rb") as image_stream:
                extracted_text, translated_text = batch_process_documents(image_stream)
            new_row = pd.DataFrame([{
                "Filename": os.path.basename(file_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            }])
            results_df = pd.concat([results_df, new_row], ignore_index=True)
        except Exception as e:
            return f"An error occurred while processing {file_path}: {e}"

    return results_df
65
+
66
# Wire the pipeline into a simple web UI: one ZIP upload in, one
# results table out, then start the Gradio server.
interface = gr.Interface(
    fn=process_zip_file,
    inputs=gr.File(label="Upload ZIP File"),
    outputs=gr.Dataframe(label="Processed Results"),
    title="Document Processing and Translation",
)

interface.launch()
72
 
 
73
 
74
 
75
  # def greet(name):