Spaces:

spark-ds549
/

Chinese-Label-Transcription

Sleeping

App Files Files Community

Chinese-Label-Transcription / app.py

mkaramb

Update app.py

8bea076 verified about 1 year ago

raw

history blame

3.07 kB

	import pandas as pd
	import gradio as gr
	from google.api_core.client_options import ClientOptions
	from google.cloud import documentai_v1 as documentai
	from google.cloud.documentai_v1.types import RawDocument
	from google.cloud import translate_v2 as translate
	import zipfile
	import os
	import io

	# Point the Google Cloud client libraries at the service-account key file.
	# The path is relative, so it is resolved against the process's working directory.
	# NOTE(review): this unconditionally overwrites any value already present in the
	# environment — confirm that is intended for deployments that inject credentials.
	os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

	# Global DataFrame declaration
	# Module-level results table with one row per processed image;
	# process_zip_file() rebuilds it on every call.
	results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])

	# Set your Google Cloud Document AI processor details here
	# batch_process_documents() combines these three values into the processor
	# resource path; `location` is also used to build the regional API endpoint.
	project_id = "herbaria-ai"
	location = "us"
	processor_id = "4307b078717a399a"

	def translate_text(text, target_language="en"):
	    """Translate plain text with the Google Cloud Translation (v2) client.

	    Args:
	        text: Plain text to translate (here: OCR output from Document AI).
	        target_language: Language code to translate into; defaults to "en".

	    Returns:
	        The translated string, or "" when ``text`` is None, empty, or
	        whitespace-only (no API call is made in that case).
	    """
	    # Images with no recognised text produce an empty string; there is
	    # nothing to translate, so skip the network round trip entirely.
	    if not text or not text.strip():
	        return ""

	    translate_client = translate.Client()
	    # format_="text": the input is plain OCR text, not HTML. In the API's
	    # default HTML mode the response comes back entity-escaped (e.g. "&#39;")
	    # and line breaks are not preserved.
	    result = translate_client.translate(
	        text,
	        target_language=target_language,
	        format_="text",
	    )
	    return result["translatedText"]

	def batch_process_documents(file_stream, file_mime_type="image/jpeg") -> tuple:
	    """Send one document to Document AI and translate the text it returns.

	    Args:
	        file_stream: Readable binary stream; it is read in full via ``.read()``.
	        file_mime_type: MIME type reported to Document AI for the content.

	    Returns:
	        A ``(extracted_text, translated_text)`` tuple, where the second item
	        is ``translate_text(extracted_text)``.
	    """
	    # The endpoint host is derived from the module-level `location`.
	    endpoint_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
	    docai_client = documentai.DocumentProcessorServiceClient(client_options=endpoint_options)
	    payload = RawDocument(content=file_stream.read(), mime_type=file_mime_type)

	    response = docai_client.process_document(
	        request=documentai.ProcessRequest(
	            name=docai_client.processor_path(project_id, location, processor_id),
	            raw_document=payload,
	        )
	    )

	    extracted_text = response.document.text
	    return extracted_text, translate_text(extracted_text)

	def find_images(directory, extensions=('.jpeg', '.jpg')):
	    """Lazily yield paths of image files located anywhere under ``directory``.

	    A file qualifies when its lower-cased name ends with one of
	    ``extensions`` and the name does not begin with a dot.

	    Args:
	        directory: Root folder to walk recursively.
	        extensions: Tuple of lower-case suffixes to accept.

	    Yields:
	        The joined path (walk root + file name) of every matching file.
	    """
	    for folder, _subdirs, entries in os.walk(directory):
	        for entry in entries:
	            if not entry.lower().endswith(extensions):
	                continue
	            if entry.startswith('.'):
	                # Dot-files (e.g. archive metadata stubs) are skipped.
	                continue
	            yield os.path.join(folder, entry)

	def _zip_source(file_info):
	    """Return a path or file-like object that ``zipfile.ZipFile`` can open."""
	    if isinstance(file_info, dict):
	        # Original calling convention: a mapping carrying the raw bytes.
	        return io.BytesIO(file_info["content"])
	    if isinstance(file_info, (bytes, bytearray)):
	        return io.BytesIO(file_info)
	    if isinstance(file_info, (str, os.PathLike)):
	        return file_info
	    # Temp-file style upload objects expose the on-disk path as ``.name``.
	    path = getattr(file_info, "name", None)
	    if path is None:
	        raise TypeError(f"Unsupported upload object: {type(file_info).__name__}")
	    return path


	def process_zip_file(file_info):
	    """Extract an uploaded ZIP, OCR and translate each JPEG, and tabulate results.

	    Args:
	        file_info: The uploaded archive. Accepted forms are a dict with a
	            "content" bytes entry, raw bytes, a filesystem path, or an
	            object whose ``name`` attribute is the path of the uploaded file.

	    Returns:
	        A DataFrame with columns "Filename", "Extracted Text" and
	        "Translated Text", one row per image, ordered by path. The same
	        object is also stored in the module-level ``results_df``.

	    Raises:
	        gr.Error: If no file was supplied or processing any image fails;
	            the message names the offending file.
	    """
	    global results_df
	    import tempfile  # function-scope import: the module header does not import it

	    if file_info is None:
	        raise gr.Error("Please upload a ZIP file.")

	    columns = ["Filename", "Extracted Text", "Translated Text"]
	    rows = []

	    # Extract into a fresh temporary directory so images left over from an
	    # earlier upload can never leak into this run's results.
	    with tempfile.TemporaryDirectory(prefix="extracted_files_") as extract_dir:
	        with zipfile.ZipFile(_zip_source(file_info), 'r') as zip_ref:
	            zip_ref.extractall(extract_dir)

	        # os.walk order is filesystem-dependent; sort for a stable table.
	        for file_path in sorted(find_images(extract_dir)):
	            try:
	                # Context manager closes the handle even when the API call fails.
	                with open(file_path, "rb") as image_file:
	                    extracted_text, translated_text = batch_process_documents(image_file)
	            except Exception as e:
	                # Surface the failure through Gradio's error channel rather than
	                # returning a string to a Dataframe output component.
	                shown_path = os.path.relpath(file_path, extract_dir)
	                raise gr.Error(
	                    f"An error occurred while processing {shown_path}: {e}"
	                ) from e
	            rows.append({
	                "Filename": os.path.basename(file_path),
	                "Extracted Text": extracted_text,
	                "Translated Text": translated_text,
	            })

	    # Build the table once instead of concatenating inside the loop.
	    results_df = pd.DataFrame(rows, columns=columns)
	    return results_df

	# Build the web UI: one ZIP upload in, one results table out.
	interface = gr.Interface(
	    title="Document Processing and Translation",
	    fn=process_zip_file,
	    inputs=gr.File(label="Upload ZIP File"),
	    outputs=gr.Dataframe(label="Processed Results"),
	)

	# Start serving as soon as the module is executed.
	interface.launch()



	# Unused sample interface, kept for reference:
	# def greet(name):
	#     return "Hello " + name + "!!"

	# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
	# iface.launch()