import gradio as gr
from gradio_rich_textbox import RichTextbox
from PIL import Image
from surya.ocr import run_ocr
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
# from lang_list import TEXT_SOURCE_LANGUAGE_NAMES
from gradio_client import Client
from dotenv import load_dotenv
import requests
from io import BytesIO
import cohere
import os
import re
import pandas as pd

title = "# Welcome to AyaTonic"
description = "Learn a New Language With Aya"

# Load environment variables: Cohere API key and the SeamlessM4T
# Gradio-space identifier used for speech<->text conversion.
load_dotenv()
COHERE_API_KEY = os.getenv('CO_API_KEY')
SEAMLESSM4T = os.getenv('SEAMLESSM4T')

# Language names displayed in the two dropdowns (one row per language).
df = pd.read_csv("lang_list.csv")

inputlanguage = ""

# Prompt templates.
# `producetext` is a str.format template: it MUST be filled with
# `target_language` before being sent to the model (done in process_input).
producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:"

# Regular expression patterns for each color.
# NOTE(review): all three patterns are identical and contain no tag markup —
# the surrounding rich-text tags appear to have been stripped at some point
# (e.g. something like '<font color="red">(.*?)</font>' was likely intended).
# Left unchanged; confirm the tag syntax before relying on extraction.
patterns = {
    "red": r'(.*?)',
    "blue": r'(.*?)',
    "green": r'(.*?)',
}

# Module-level scratch dict mirroring `patterns`; kept for backward
# compatibility (TaggedPhraseExtractor builds and returns its own matches).
matches = {
    "red": [],
    "blue": [],
    "green": [],
}


class TaggedPhraseExtractor:
    """Extract color-tagged phrases from a rich-text string.

    Patterns are registered per color with add_pattern(); extract_phrases()
    returns every regex match grouped by color.
    """

    def __init__(self, text=''):
        self.text = text       # text to search within
        self.patterns = {}     # color name -> regex pattern

    def set_text(self, text):
        """Set the text to search within."""
        self.text = text

    def add_pattern(self, color, pattern):
        """Add a new color and its associated regex pattern."""
        self.patterns[color] = pattern

    def extract_phrases(self):
        """Return {color: [matched phrases]} for all registered patterns."""
        return {color: re.findall(pattern, self.text)
                for color, pattern in self.patterns.items()}

    def print_phrases(self):
        """Extract phrases and print them, grouped by color."""
        for color, phrases in self.extract_phrases().items():
            print(f"Phrases with color {color}:")
            for phrase in phrases:
                print(f"- {phrase}")
            print()


# Shared service clients; the Gradio client is reused by both audio helpers
# instead of being reconstructed on every call.
co = cohere.Client(COHERE_API_KEY)
audio_client = Client(SEAMLESSM4T)
client = Client(SEAMLESSM4T)


def process_audio_to_text(audio_path, inputlanguage="English"):
    """Convert an audio file to text via the SeamlessM4T space (/s2tt).

    :param audio_path: filesystem path of the recorded audio.
    :param inputlanguage: language spoken in the recording.
    :return: the transcribed text (first element of the space's result).
    """
    result = audio_client.predict(
        audio_path,
        inputlanguage,
        inputlanguage,
        api_name="/s2tt"
    )
    print("Audio Result: ", result)
    return result[0]


def process_text_to_audio(text, translatefrom, translateto):
    """Convert text to speech via the SeamlessM4T space (/t2st).

    :param text: text to synthesize.
    :param translatefrom: language the text is written in.
    :param translateto: language the audio should be spoken in.
    :return: path of the generated audio (first element of the result).
    """
    result = audio_client.predict(
        text,
        translatefrom,
        translateto,
        api_name="/t2st"
    )
    return result[0]


class OCRProcessor:
    """Wraps surya detection + recognition models for image and PDF OCR."""

    def __init__(self, langs=None):
        # `langs=None` instead of a mutable default list; defaults to English.
        self.langs = ["en"] if langs is None else langs
        self.det_processor, self.det_model = load_det_processor(), load_det_model()
        self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor()

    def process_image(self, image):
        """Run OCR on a PIL image and return the first prediction object."""
        predictions = run_ocr([image], [self.langs], self.det_model,
                              self.det_processor, self.rec_model, self.rec_processor)
        return predictions[0]

    def process_pdf(self, pdf_path):
        """Run OCR on a PDF file path and return the first prediction object."""
        predictions = run_ocr([pdf_path], [self.langs], self.det_model,
                              self.det_processor, self.rec_model, self.rec_processor)
        return predictions[0]


def _ocr_prediction_to_text(ocr_prediction):
    """Flatten a surya OCR prediction into a single space-prefixed string.

    Replaces the index-juggling loop that was duplicated three times in
    process_input. NOTE(review): reproduces the original traversal exactly
    (list(prediction)[0][1] -> per-line entries -> [1][1] text field);
    confirm against the surya prediction schema.
    """
    entries = list(ocr_prediction)[0][1]
    return "".join(" " + list(entry)[1][1] for entry in entries)


def process_input(image=None, file=None, audio=None, text="",
                  translateto="English", translatefrom="English"):
    """Full pipeline behind the AyaTonic button.

    Gathers text from any combination of camera image, uploaded file
    (image or PDF), microphone audio, and typed text; asks Aya to write a
    blog post in the target language; asks command-nightly to add rich-text
    color tags; and synthesizes audio of the result.

    :param translateto: language the user wants to learn (output language).
    :param translatefrom: user's native language (input language).
    :return: (processed_text, audio_output_path)
    """
    ocr_processor = OCRProcessor()
    final_text = text

    if image is not None:
        final_text += _ocr_prediction_to_text(ocr_processor.process_image(image))

    if file is not None:
        if file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            pil_image = Image.open(file)
            final_text += _ocr_prediction_to_text(ocr_processor.process_image(pil_image))
        elif file.name.lower().endswith('.pdf'):
            final_text += _ocr_prediction_to_text(ocr_processor.process_pdf(file.name))
        else:
            final_text += "\nUnsupported file type."
    print("OCR Text: ", final_text)

    if audio is not None:
        # BUG FIX: pass the user's native language; previously the call
        # omitted it and always transcribed as "English".
        audio_text = process_audio_to_text(audio, translatefrom)
        final_text += "\n" + audio_text

    # BUG FIX: fill the {target_language} placeholder; previously the
    # template was concatenated unformatted and the literal placeholder
    # text was sent to the model.
    final_text_with_producetext = final_text + producetext.format(target_language=translateto)

    response = co.generate(
        model='c4ai-aya',
        prompt=final_text_with_producetext,
        max_tokens=1024,
        temperature=0.5
    )
    # TODO: add graceful handling for errors (context overflow, API failures)
    generated_text = response.generations[0].text
    print("Generated Text: ", generated_text)

    generated_text_with_format = generated_text + "\n" + formatinputstring
    response = co.generate(
        model='command-nightly',
        prompt=generated_text_with_format,
        max_tokens=4000,
        temperature=0.5
    )
    processed_text = response.generations[0].text

    # The generated post is already in the target language, so both the
    # source and target of the speech synthesis are `translateto`.
    audio_output = process_text_to_audio(processed_text, translateto, translateto)
    return processed_text, audio_output


def main():
    """Build and launch the Gradio UI."""
    with gr.Blocks() as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        with gr.Row():
            input_language = gr.Dropdown(choices=df["name"].to_list(), label="Your Native Language")
            target_language = gr.Dropdown(choices=df["name"].to_list(), label="Language To Learn")
        with gr.Accordion("Talk To 🌟AyaTonic"):
            with gr.Tab("🤙🏻Audio & Text"):
                audio_input = gr.Audio(sources="microphone", type="filepath", label="Mic Input")
                text_input = gr.Textbox(lines=2, label="Text Input")
            with gr.Tab("📸Image & File"):
                image_input = gr.Image(type="pil", label="Camera Input")
                file_input = gr.File(label="File Upload")
        process_button = gr.Button("🌟AyaTonic")
        processed_text_output = RichTextbox(label="Processed Text")
        audio_output = gr.Audio(label="Audio Output")
        process_button.click(
            fn=process_input,
            # BUG FIX: the language dropdowns were swapped — the native
            # language was wired to `translateto` and the language to learn
            # to `translatefrom`. Order now matches process_input's
            # (translateto, translatefrom) parameters.
            inputs=[image_input, file_input, audio_input, text_input,
                    target_language, input_language],
            outputs=[processed_text_output, audio_output]
        )
    # BUG FIX: the interface was built but never served.
    demo.launch()


if __name__ == "__main__":
    main()