File size: 2,401 Bytes
83c1b5b
851606d
83c1b5b
3e35f55
83c1b5b
 
 
 
 
3e35f55
 
83c1b5b
851606d
 
83c1b5b
851606d
 
 
 
 
 
 
 
 
 
 
 
 
9f43ee3
851606d
 
 
 
 
9f43ee3
851606d
 
 
 
 
 
 
83c1b5b
851606d
3e35f55
020eac5
851606d
 
 
 
 
 
 
 
 
dfb8d4c
020eac5
9f43ee3
020eac5
dfb8d4c
83c1b5b
 
 
851606d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import streamlit as st
import PyPDF2
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

def _cached_resource(func):
    """Wrap *func* in Streamlit's resource cache when this Streamlit has one.

    Newer Streamlit releases expose ``st.cache_resource``; some older ones
    expose ``st.experimental_singleton``. If neither exists the function is
    returned unchanged, so the app still works (just without caching).
    """
    decorator = getattr(st, "cache_resource", None) or getattr(
        st, "experimental_singleton", None
    )
    return decorator(func) if decorator is not None else func


@_cached_resource
def _load_model_and_tokenizer():
    """Load the mBART-50 one-to-many model and its English-source tokenizer."""
    checkpoint = "facebook/mbart-large-50-one-to-many-mmt"
    model = MBartForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint, src_lang="en_XX")
    return model, tokenizer


def main() -> None:
    """Render the translation app: pick an input source, then translate it.

    The user either types text or uploads a PDF whose text is extracted and
    displayed. Pressing "Translate" passes that text to ``translate_text``
    and shows the result.
    """
    st.title("Translation App")

    # Streamlit re-runs this script on every widget interaction, so the
    # model must come from a cache instead of being re-loaded each time.
    model, tokenizer = _load_model_and_tokenizer()

    # Input option: Text area or file upload
    input_option = st.radio("Select Input Option", ("Text", "PDF"))

    if input_option == "Text":
        source_text = st.text_area("Enter text to translate", "")
    else:  # "PDF" is the only other radio choice
        pdf_file = st.file_uploader("Upload PDF file", type=['pdf'])
        if pdf_file is None:
            return
        source_text = extract_text_from_pdf(pdf_file)
        st.write("Extracted Text from PDF:")
        st.write(source_text)

    # st.button() is True only for the single rerun right after the click.
    # translate_text() draws its own language selectbox, and changing that
    # selectbox triggers another rerun in which the button is False again.
    # Remember which text the user asked to translate so the selectbox and
    # the translation survive that rerun; editing the text (or uploading a
    # different PDF) invalidates the request until "Translate" is clicked again.
    request_key = "translate_requested_text"
    if st.button("Translate"):
        st.session_state[request_key] = source_text

    if st.session_state.get(request_key) == source_text:
        translated_text = translate_text(source_text, model, tokenizer)
        st.write("Translated Text:")
        st.write(translated_text)

def extract_text_from_pdf(pdf_file) -> str:
    """Return the text of every page of *pdf_file*, concatenated in page order.

    Args:
        pdf_file: A path or binary file-like object accepted by PyPDF2's
            reader (e.g. the object returned by ``st.file_uploader``).

    Returns:
        The extracted text of all pages joined without a separator. Pages
        that yield no text contribute an empty string.
    """
    if hasattr(PyPDF2, "PdfReader"):
        # Current PyPDF2 API. The camelCase names used previously
        # (PdfFileReader / numPages / getPage / extractText) raise in
        # PyPDF2 >= 3.0, so prefer the snake_case API whenever it exists.
        pages = PyPDF2.PdfReader(pdf_file).pages
        return "".join(page.extract_text() or "" for page in pages)

    # Legacy PyPDF2 releases that only ship the camelCase API.
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    return "".join(
        pdf_reader.getPage(page_num).extractText() or ""
        for page_num in range(pdf_reader.numPages)
    )

def translate_text(input_text, model, tokenizer, target_lang=None):
    """Translate *input_text* with an mBART-50 model into an Indian language.

    Args:
        input_text: Source text to translate.
        model: Seq2seq model exposing ``generate`` (as loaded in ``main``).
        tokenizer: Matching tokenizer exposing ``lang_code_to_id`` and
            ``decode``.
        target_lang: Optional mBART language code such as ``"hi_IN"``. When
            omitted, a Streamlit selectbox is rendered and the user's choice
            (Hindi, Tamil or Telugu) is used.

    Returns:
        The decoded translation, or ``""`` when *input_text* is empty or
        whitespace-only. Text longer than the tokenizer's maximum length is
        truncated before translation.

    Raises:
        KeyError: If *target_lang* is not a language code known to the
            tokenizer.
    """
    language_codes = {"Hindi": "hi_IN", "Tamil": "ta_IN", "Telugu": "te_IN"}

    if target_lang is None:
        # Render the widget even for blank input so the UI stays stable.
        translate_to = st.selectbox(
            "Select language to translate", tuple(language_codes)
        )
        target_lang = language_codes[translate_to]

    # Nothing to translate: skip the model instead of generating from an
    # input that contains only special tokens.
    if not input_text or not input_text.strip():
        return ""

    # truncation=True keeps long inputs (e.g. whole PDFs) within the
    # tokenizer's maximum length instead of failing inside the model.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True)

    generated_tokens = model.generate(
        input_ids=encoded.input_ids,
        # Forcing the first generated token selects the output language.
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
    )
    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

# Script entry point: start the app only when this file is executed directly.
if __name__ == "__main__":
    main()