# Streamlit app: translate English text (typed or extracted from an uploaded
# PDF) into Hindi, Tamil, or Telugu using facebook/mbart-large-50-one-to-many-mmt.
# Third-party imports.
# NOTE(review): PyPDF2 is imported three different ways below and transformers'
# Auto* classes are imported alongside the MBart classes; only PdfReader,
# MBartForConditionalGeneration and MBart50TokenizerFast are used in this
# file. The redundant lines are kept so no module-level name disappears.
import streamlit as st

import PyPDF2
import PyPDF2 as PDF
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
def main():
    """Streamlit entry point.

    Renders a UI that translates English text — typed into a text area or
    extracted from an uploaded PDF — using the mBART-50 one-to-many model.
    """
    st.title("Translation App")

    # Load model and tokenizer once. Without caching, every widget
    # interaction triggers a full Streamlit rerun that re-instantiates
    # (and on a cold cache, re-downloads) the ~2 GB model.
    model, tokenizer = _load_translation_model()

    # Input option: Text area or file upload
    input_option = st.radio("Select Input Option", ("Text", "PDF"))

    if input_option == "Text":
        input_text = st.text_area("Enter text to translate", "")
        translate_button = st.button("Translate")
        if translate_button:
            if not input_text.strip():
                # Guard the empty case instead of running generate() on "".
                st.warning("Please enter some text to translate.")
            else:
                # NOTE(review): translate_text renders its own language
                # selectbox, which only exists during this button-press
                # rerun — consider hoisting the language choice up here.
                translated_text = translate_text(input_text, model, tokenizer)
                st.write("Translated Text:")
                st.write(translated_text)
    elif input_option == "PDF":
        pdf_file = st.file_uploader("Upload PDF file", type=['pdf'])
        if pdf_file is not None:
            pdf_text = extract_text_from_pdf(pdf_file)
            st.write("Extracted Text from PDF:")
            st.write(pdf_text)
            translate_button = st.button("Translate")
            if translate_button:
                if not pdf_text.strip():
                    st.warning("No extractable text was found in the PDF.")
                else:
                    translated_text = translate_text(pdf_text, model, tokenizer)
                    st.write("Translated Text:")
                    st.write(translated_text)


@st.cache_resource
def _load_translation_model():
    """Load and cache the mBART-50 one-to-many model and its tokenizer.

    Source language is fixed to English ("en_XX"). The st.cache_resource
    decorator keeps a single instance alive across Streamlit reruns.
    """
    model = MBartForConditionalGeneration.from_pretrained(
        "facebook/mbart-large-50-one-to-many-mmt")
    tokenizer = MBart50TokenizerFast.from_pretrained(
        "facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
    return model, tokenizer
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Parameters
    ----------
    pdf_file : file-like object
        Open binary stream containing a PDF (e.g. a Streamlit upload).

    Returns
    -------
    str
        All extractable page text joined together; pages with no text
        layer contribute an empty string.
    """
    pdf_reader = PdfReader(pdf_file)
    # extract_text() may return None for pages without extractable text;
    # `or ""` keeps the join None-safe. A single join also avoids the
    # quadratic cost of repeated str `+=` on large documents.
    return "".join((page.extract_text() or "") for page in pdf_reader.pages)
def translate_text(input_text, model, tokenizer, target_lang=None):
    """Translate English *input_text* with mBART-50.

    Parameters
    ----------
    input_text : str
        English source text.
    model : MBartForConditionalGeneration
        The one-to-many mBART-50 model.
    tokenizer : MBart50TokenizerFast
        Tokenizer configured with src_lang="en_XX".
    target_lang : str or None
        Optional mBART language code (e.g. "hi_IN"). When None — the
        default, preserving the original call signature — a selectbox is
        rendered to pick Hindi, Tamil, or Telugu.

    Returns
    -------
    str
        The translated text.
    """
    # Human-readable choice -> mBART-50 language code.
    lang_codes = {"Hindi": "hi_IN", "Tamil": "ta_IN", "Telugu": "te_IN"}
    if target_lang is None:
        # NOTE(review): this widget is created inside the translate call,
        # so it only appears during the button-press rerun and the user
        # cannot change it before translation happens. Prefer choosing the
        # language in main() and passing target_lang explicitly.
        translate_to = st.selectbox("Select language to translate", ("Hindi", "Tamil", "Telugu"))
        target_lang = lang_codes[translate_to]
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    # forced_bos_token_id tells mBART which language to generate into.
    generated_tokens = model.generate(
        input_ids=input_ids,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
    )
    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
# Entry point when executed directly (e.g. `streamlit run app.py`).
if __name__ == '__main__':
    main()