Spaces:
Sleeping
Sleeping
File size: 2,401 Bytes
83c1b5b 851606d 83c1b5b 3e35f55 83c1b5b 3e35f55 83c1b5b 851606d 83c1b5b 851606d 9f43ee3 851606d 9f43ee3 851606d 83c1b5b 851606d 3e35f55 020eac5 851606d dfb8d4c 020eac5 9f43ee3 020eac5 dfb8d4c 83c1b5b 851606d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import streamlit as st
import PyPDF2
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
# Hugging Face checkpoint used for both the model and its tokenizer.
_MODEL_NAME = "facebook/mbart-large-50-one-to-many-mmt"

# Languages offered in the UI, mapped to the codes that are looked up in
# ``tokenizer.lang_code_to_id`` when forcing the output language.
_TARGET_LANGUAGES = {
    "Hindi": "hi_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
}

# Label of the language selector; shared by main() and the fallback path of
# translate_text() so both render the same widget text.
_LANGUAGE_PROMPT = "Select language to translate"


def _load_translator():
    """Load the translation model and its English-source tokenizer."""
    model = MBartForConditionalGeneration.from_pretrained(_MODEL_NAME)
    tokenizer = MBart50TokenizerFast.from_pretrained(_MODEL_NAME, src_lang="en_XX")
    return model, tokenizer


def _cached_translator():
    """Return ``(model, tokenizer)``, reusing them across reruns when possible.

    Streamlit re-executes the script on every widget interaction, so an
    uncached ``from_pretrained`` call would reload the checkpoint each time.
    The resource cache is looked up by name because its spelling depends on
    the installed Streamlit release; when neither spelling exists the model is
    simply loaded directly, which matches the previous behaviour.
    """
    cache = getattr(st, "cache_resource", None) or getattr(
        st, "experimental_singleton", None
    )
    loader = cache(_load_translator) if cache is not None else _load_translator
    return loader()


def main():
    """Render the translation page: collect text or a PDF, then translate it."""
    st.title("Translation App")

    # Load model and tokenizer
    model, tokenizer = _cached_translator()

    # Input option: Text area or file upload
    input_option = st.radio("Select Input Option", ("Text", "PDF"))

    if input_option == "Text":
        source_text = st.text_area("Enter text to translate", "")
    else:
        pdf_file = st.file_uploader("Upload PDF file", type=['pdf'])
        if pdf_file is None:
            # Nothing uploaded yet: there is nothing to show or translate.
            return
        source_text = extract_text_from_pdf(pdf_file)
        st.write("Extracted Text from PDF:")
        st.write(source_text)

    # The selector must be rendered on every run, *before* the button. If it
    # only appeared after a click, changing it would trigger a rerun in which
    # the button is no longer pressed, and the choice could never take effect.
    translate_to = st.selectbox(_LANGUAGE_PROMPT, tuple(_TARGET_LANGUAGES))

    if st.button("Translate"):
        if not source_text or not source_text.strip():
            st.warning("Please provide some text to translate.")
            return
        translated_text = translate_text(
            source_text,
            model,
            tokenizer,
            target_lang=_TARGET_LANGUAGES[translate_to],
        )
        st.write("Translated Text:")
        st.write(translated_text)


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in ``pdf_file``.

    Args:
        pdf_file: A binary file-like object (or path) accepted by PyPDF2,
            such as the object returned by ``st.file_uploader``.

    Returns:
        The text of all pages joined in page order, with no separator.
    """
    if hasattr(PyPDF2, "PdfReader"):
        # Current PyPDF2 API. The camelCase names used below are rejected by
        # PyPDF2 3.x, so prefer this path whenever it is available.
        pages = PyPDF2.PdfReader(pdf_file).pages
        return "".join(page.extract_text() or "" for page in pages)

    # Legacy PyPDF2 releases that only ship the camelCase API.
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    return "".join(
        pdf_reader.getPage(page_num).extractText() or ""
        for page_num in range(pdf_reader.numPages)
    )


def translate_text(input_text, model, tokenizer, target_lang=None) -> str:
    """Translate ``input_text`` into ``target_lang`` with ``model``.

    Args:
        input_text: Source text to translate.
        model: Object exposing ``generate(input_ids=..., forced_bos_token_id=...)``.
        tokenizer: Callable tokenizer exposing ``lang_code_to_id`` and ``decode``.
        target_lang: Language code such as ``"hi_IN"``. When omitted, a
            Streamlit select box is rendered here to ask for the language,
            which preserves the behaviour for callers that pass only three
            arguments.

    Returns:
        The decoded translation, or ``""`` when ``input_text`` is blank.

    Raises:
        KeyError: If ``target_lang`` is not a key of ``tokenizer.lang_code_to_id``.
    """
    if target_lang is None:
        translate_to = st.selectbox(_LANGUAGE_PROMPT, tuple(_TARGET_LANGUAGES))
        target_lang = _TARGET_LANGUAGES[translate_to]

    if not input_text or not input_text.strip():
        # Running the model on blank input only produces noise.
        return ""

    # truncation=True clips over-long input (e.g. a whole PDF) to the
    # tokenizer's configured maximum instead of letting generation fail.
    # NOTE(review): this means very long documents are only partly
    # translated - confirm that is acceptable or add chunking.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True)
    generated_tokens = model.generate(
        input_ids=encoded.input_ids,
        # Forcing the first generated token selects the output language.
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
    )
    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
# Start the app only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|