"""Streamlit app that translates English text (typed or extracted from a PDF)
into Hindi, Tamil, or Telugu using the mBART-50 one-to-many checkpoint."""

import streamlit as st
import PyPDF2
import PyPDF2 as PDF
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Hugging Face checkpoint used for both the model and its tokenizer.
_MODEL_NAME = "facebook/mbart-large-50-one-to-many-mmt"

# Display name shown in the UI -> language code passed to the tokenizer.
_TARGET_LANGUAGES = {
    "Hindi": "hi_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
}


@st.cache_resource
def _load_model_and_tokenizer():
    """Load the translation model and tokenizer once and reuse them.

    Streamlit re-executes the script on every widget interaction; caching
    the loaded objects avoids re-instantiating the model on each rerun.
    NOTE(review): ``st.cache_resource`` requires a Streamlit release that
    provides it -- confirm against the deployed version.

    Returns:
        A ``(model, tokenizer)`` tuple; the tokenizer is configured with
        English (``en_XX``) as the source language.
    """
    model = MBartForConditionalGeneration.from_pretrained(_MODEL_NAME)
    tokenizer = MBart50TokenizerFast.from_pretrained(_MODEL_NAME, src_lang="en_XX")
    return model, tokenizer


def _render_translation(source_text, model, tokenizer):
    """Render the language picker and Translate button, then show the result.

    The selectbox is created *before* the button so that it exists on every
    rerun and its value is already set when the button is clicked.
    """
    translate_to = st.selectbox(
        "Select language to translate", tuple(_TARGET_LANGUAGES)
    )
    if not st.button("Translate"):
        return

    # Nothing useful can be translated from empty/whitespace-only input.
    if not source_text or not source_text.strip():
        st.warning("There is no text to translate.")
        return

    translated_text = translate_text(
        source_text, model, tokenizer, target_lang=_TARGET_LANGUAGES[translate_to]
    )
    st.write("Translated Text:")
    st.write(translated_text)


def main():
    """Build the Streamlit page: choose text or PDF input, then translate it."""
    st.title("Translation App")

    # Load model and tokenizer (cached across reruns).
    model, tokenizer = _load_model_and_tokenizer()

    # Input option: Text area or file upload
    input_option = st.radio("Select Input Option", ("Text", "PDF"))

    if input_option == "Text":
        input_text = st.text_area("Enter text to translate", "")
        _render_translation(input_text, model, tokenizer)
    elif input_option == "PDF":
        pdf_file = st.file_uploader("Upload PDF file", type=['pdf'])
        if pdf_file is not None:
            pdf_text = extract_text_from_pdf(pdf_file)
            st.write("Extracted Text from PDF:")
            st.write(pdf_text)
            _render_translation(pdf_text, model, tokenizer)


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in ``pdf_file``.

    Args:
        pdf_file: A file-like object (e.g. a Streamlit upload) or path
            accepted by ``PdfReader``.

    Returns:
        The page texts joined in page order with no separator.
    """
    pdf_reader = PdfReader(pdf_file)
    # ``or ""`` keeps the join safe if a page yields no text (None/empty).
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def translate_text(input_text, model, tokenizer, target_lang=None):
    """Translate ``input_text`` into the requested target language.

    Args:
        input_text: Source text to translate.
        model: Generation model exposing ``generate``.
        tokenizer: Tokenizer exposing ``lang_code_to_id`` and ``decode``.
        target_lang: Target language code (a value of ``_TARGET_LANGUAGES``,
            e.g. ``"hi_IN"``). When omitted, a selectbox is rendered here to
            ask for the language, preserving the previous call signature.

    Returns:
        The decoded translation of the first generated sequence.
    """
    if target_lang is None:
        # Backward-compatible path for callers that do not pass a language.
        translate_to = st.selectbox(
            "Select language to translate", tuple(_TARGET_LANGUAGES)
        )
        target_lang = _TARGET_LANGUAGES[translate_to]

    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    generated_tokens = model.generate(
        input_ids=input_ids,
        # Force the first generated token to be the target-language code.
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
    )
    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)


if __name__ == '__main__':
    main()