Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import streamlit as st
|
|
|
2 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
3 |
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
|
4 |
|
@@ -9,43 +10,55 @@ def main():
|
|
9 |
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
|
10 |
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
|
11 |
|
12 |
-
# Input
|
13 |
-
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
24 |
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
29 |
|
30 |
-
|
31 |
-
translated_text = translate_text(input_text, model, tokenizer, target_lang="te_IN")
|
32 |
-
st.write("Translated Text (Telugu):")
|
33 |
-
st.write(translated_text)
|
34 |
-
|
35 |
-
def translate_text(input_text, model, tokenizer, target_lang):
|
36 |
-
# Tokenize input text
|
37 |
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
|
38 |
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
generated_tokens = model.generate(
|
41 |
input_ids=input_ids,
|
42 |
forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
|
43 |
)
|
44 |
-
|
45 |
-
# Decode translated text
|
46 |
translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
|
47 |
-
|
48 |
return translated_text
|
49 |
|
50 |
if __name__ == '__main__':
|
51 |
-
main()
|
|
|
1 |
import streamlit as st
|
2 |
+
import PyPDF2
|
3 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
4 |
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
|
5 |
|
|
|
10 |
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
|
11 |
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
|
12 |
|
13 |
+
# Input option: Text area or file upload
|
14 |
+
input_option = st.radio("Select Input Option", ("Text", "PDF"))
|
15 |
|
16 |
+
if input_option == "Text":
|
17 |
+
input_text = st.text_area("Enter text to translate", "")
|
18 |
+
translate_button = st.button("Translate")
|
19 |
+
if translate_button:
|
20 |
+
translated_text = translate_text(input_text, model, tokenizer)
|
21 |
+
st.write("Translated Text:")
|
22 |
+
st.write(translated_text)
|
23 |
+
elif input_option == "PDF":
|
24 |
+
pdf_file = st.file_uploader("Upload PDF file", type=['pdf'])
|
25 |
+
if pdf_file is not None:
|
26 |
+
pdf_text = extract_text_from_pdf(pdf_file)
|
27 |
+
st.write("Extracted Text from PDF:")
|
28 |
+
st.write(pdf_text)
|
29 |
|
30 |
+
translate_button = st.button("Translate")
|
31 |
+
if translate_button:
|
32 |
+
translated_text = translate_text(pdf_text, model, tokenizer)
|
33 |
+
st.write("Translated Text:")
|
34 |
+
st.write(translated_text)
|
35 |
|
36 |
+
def extract_text_from_pdf(pdf_file):
    """Extract all text from an uploaded PDF file object.

    Args:
        pdf_file: A binary file-like object containing a PDF
            (e.g. the value returned by ``st.file_uploader``).

    Returns:
        The concatenated text of every page. Pages with no extractable
        text contribute an empty string instead of raising.
    """
    # PdfReader / reader.pages / page.extract_text() replace the
    # PdfFileReader / numPages / getPage / extractText API, which was
    # deprecated in PyPDF2 2.x and removed in 3.0.
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() may return None/"" for image-only pages; `or ""`
    # keeps the join safe.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
43 |
|
44 |
+
def translate_text(input_text, model, tokenizer):
    """Translate English ``input_text`` into a user-selected language.

    Args:
        input_text: English source text to translate.
        model: An ``MBartForConditionalGeneration`` checkpoint
            (facebook/mbart-large-50-one-to-many-mmt).
        tokenizer: The matching ``MBart50TokenizerFast`` configured with
            ``src_lang="en_XX"``.

    Returns:
        The translated text as a plain string.

    Raises:
        KeyError: If the selected language has no mBART-50 code.
    """
    # Tokenize the English source text into model input ids.
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids

    # Map the human-readable choice to an mBART-50 language code.
    # A dict lookup fails fast with the offending choice, instead of the
    # old if/elif chain whose "" fallback produced an opaque KeyError
    # deep inside lang_code_to_id.
    lang_codes = {"Hindi": "hi_IN", "Tamil": "ta_IN", "Telugu": "te_IN"}
    # NOTE(review): rendering a widget inside this function is a
    # Streamlit anti-pattern — the selectbox only appears after the
    # Translate button is pressed, and changing it re-runs the script and
    # resets the button state. Consider hoisting it into main().
    translate_to = st.selectbox("Select language to translate",
                                tuple(lang_codes))
    target_lang = lang_codes[translate_to]

    # Force the decoder to begin with the target-language token so the
    # one-to-many model emits the requested language.
    generated_tokens = model.generate(
        input_ids=input_ids,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
    )

    # Decode the generated ids back into text, dropping special tokens.
    translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translated_text
|
62 |
|
63 |
# Launch the Streamlit app when this file is executed as a script.
if __name__ == '__main__':
    main()
|