File size: 2,422 Bytes
83c1b5b
851606d
e3f0b46
83c1b5b
3e35f55
83c1b5b
 
 
 
 
3e35f55
 
83c1b5b
851606d
 
83c1b5b
851606d
 
 
 
 
 
 
 
 
 
6608f42
851606d
6608f42
9f43ee3
851606d
 
 
 
 
9f43ee3
851606d
 
 
 
 
 
 
83c1b5b
851606d
3e35f55
020eac5
851606d
 
 
 
 
 
 
 
 
dfb8d4c
020eac5
9f43ee3
020eac5
dfb8d4c
83c1b5b
 
 
851606d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re
import textwrap

import PyPDF2
import PyPDF2 as PDF
import streamlit as st
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

def main():
    st.title("Translation App")

    # Load model and tokenizer
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")

    # Input option: Text area or file upload
    input_option = st.radio("Select Input Option", ("Text", "PDF"))

    if input_option == "Text":
        input_text = st.text_area("Enter text to translate", "")
        translate_button = st.button("Translate")
        if translate_button:
            translated_text = translate_text(input_text, model, tokenizer)
            st.write("Translated Text:")
            st.write(translated_text)
    elif input_option == "PDF":
        pdf_file = st.file_uploader("Upload PDF file", type=['pdf'])
        if pdf_file is not None:
            pdf_text = extract_text_from_pdf(pdf_file)
            st.write("Extracted Text from PDF:")
            st.write(pdf_text)

            translate_button = st.button("Translate")
            if translate_button:
                translated_text = translate_text(pdf_text, model, tokenizer)
                st.write("Translated Text:")
                st.write(translated_text)

def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    text = ""
    for page_num in range(pdf_reader.numPages):
        page = pdf_reader.getPage(page_num)
        text += page.extractText()
    return text

def translate_text(input_text, model, tokenizer):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    
    translate_to = st.selectbox("Select language to translate", ("Hindi", "Tamil", "Telugu"))
    target_lang = ""
    if translate_to == "Hindi":
        target_lang = "hi_IN"
    elif translate_to == "Tamil":
        target_lang = "ta_IN"
    elif translate_to == "Telugu":
        target_lang = "te_IN"

    generated_tokens = model.generate(
        input_ids=input_ids,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
    )
    translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translated_text

# Start the app only when this file is run as the main module, not on import.
if __name__ == '__main__':
    main()