tahirsher committed on
Commit
464541c
·
verified ·
1 Parent(s): 00053f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -25
app.py CHANGED
@@ -3,40 +3,56 @@ import PyPDF2
3
  import docx2txt
4
  from transformers import pipeline
5
 
6
- # Hugging Face translation pipeline
7
- translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en") # Multilingual to English
8
- translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-ur") # Multilingual to Urdu
 
 
 
9
 
10
  def extract_text_from_pdf(file):
11
- """Extract text from a PDF file."""
12
- pdf_reader = PyPDF2.PdfReader(file)
13
  text = ""
14
- for page in pdf_reader.pages:
15
- text += page.extract_text()
 
 
 
 
16
  return text
17
 
18
  def extract_text_from_word(file):
19
- """Extract text from a Word file."""
20
- return docx2txt.process(file)
 
 
 
 
21
 
22
- def translate_text(text, target_language):
23
- """Translate text to the selected language."""
24
- if target_language == "English":
25
- return translator_en(text[:500]) if text else "No text found"
26
- elif target_language == "Urdu":
27
- return translator_ur(text[:500]) if text else "No text found"
28
- return "Invalid translation choice."
 
 
 
 
 
 
 
29
 
30
  # Streamlit UI
31
- st.title("Multilingual Document Translator")
32
- st.write("Translate PDF or Word documents to English and Urdu quickly.")
33
 
34
- # File uploader
35
  uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
36
- target_language = st.radio("Select the target language for translation", ["English", "Urdu"])
37
 
38
  if uploaded_file:
39
- # Extract text
40
  if uploaded_file.name.endswith(".pdf"):
41
  text_content = extract_text_from_pdf(uploaded_file)
42
  else:
@@ -46,12 +62,15 @@ if uploaded_file:
46
  st.subheader("Extracted Text (Preview)")
47
  st.write(text_content[:500] if text_content else "No content found in the file.")
48
 
49
- # Perform translation
50
  if st.button("Translate"):
51
  if text_content:
52
  st.subheader(f"Translated Text ({target_language})")
53
- translation_results = translate_text(text_content, target_language)
54
- translations = "\n".join([result['translation_text'] for result in translation_results])
55
- st.text_area("Translation Output", translations, height=300)
 
 
 
56
  else:
57
  st.warning("No text found to translate. Please upload a valid document.")
 
3
  import docx2txt
4
  from transformers import pipeline
5
 
6
# Initialize Hugging Face translation pipelines (force the PyTorch backend).
# translator_en: multilingual -> English, translator_ur: multilingual -> Urdu.
try:
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-ur", framework="pt")
except Exception as e:
    st.error(f"Failed to initialize translation models. Error: {e}")
    # Without the models the rest of the script cannot work: the translator
    # names would be unbound and the Translate button would raise NameError.
    # Halt this script run after showing the error instead.
    st.stop()
12
 
13
  def extract_text_from_pdf(file):
14
+ """Extract text from PDF."""
 
15
  text = ""
16
+ try:
17
+ pdf_reader = PyPDF2.PdfReader(file)
18
+ for page in pdf_reader.pages:
19
+ text += page.extract_text()
20
+ except Exception as e:
21
+ st.error(f"Error extracting text from PDF: {e}")
22
  return text
23
 
24
  def extract_text_from_word(file):
25
+ """Extract text from Word file."""
26
+ try:
27
+ return docx2txt.process(file)
28
+ except Exception as e:
29
+ st.error(f"Error extracting text from Word document: {e}")
30
+ return ""
31
 
32
def _split_into_chunks(text, max_chunk_size):
    """Split text into pieces of at most max_chunk_size characters.

    Cuts are placed right after whitespace whenever the window contains any,
    so words are not split in half. A run with no whitespace that is longer
    than the window is hard-split. Concatenating the pieces gives back text.
    """
    chunks = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + max_chunk_size, total)
        if end < total and not text[end].isspace():
            # The window ends inside a word: back up to just after the last
            # whitespace character in the window, if there is one.
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:
                end = cut
        chunks.append(text[start:end])
        start = end
    return chunks


def translate_text(text, translator):
    """Translate text in chunks using the given translator.

    Args:
        text: The text to translate. An empty string yields "".
        translator: Callable taking one string and returning a list whose
            first item is a dict with a 'translation_text' key.

    Returns:
        The chunk translations joined with single spaces, or "" if any chunk
        fails (the error is reported with ``st.error``).
    """
    max_chunk_size = 512  # Limit due to token constraints
    translations = []

    for chunk in _split_into_chunks(text, max_chunk_size):
        if not chunk.strip():
            # Nothing to translate in a whitespace-only piece.
            continue
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)
46
 
47
  # Streamlit UI
48
+ st.title("📚 Multilingual Document Translator")
49
+ st.write("Translate PDF or Word documents to English and Urdu effortlessly!")
50
 
 
51
  uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
52
+ target_language = st.radio("Select target language for translation", ["English", "Urdu"])
53
 
54
  if uploaded_file:
55
+ # Extract text from the uploaded file
56
  if uploaded_file.name.endswith(".pdf"):
57
  text_content = extract_text_from_pdf(uploaded_file)
58
  else:
 
62
  st.subheader("Extracted Text (Preview)")
63
  st.write(text_content[:500] if text_content else "No content found in the file.")
64
 
65
+ # Perform translation when the user clicks the button
66
  if st.button("Translate"):
67
  if text_content:
68
  st.subheader(f"Translated Text ({target_language})")
69
+ if target_language == "English":
70
+ translated_text = translate_text(text_content, translator_en)
71
+ else:
72
+ translated_text = translate_text(text_content, translator_ur)
73
+
74
+ st.text_area("Translation Output", translated_text, height=300)
75
  else:
76
  st.warning("No text found to translate. Please upload a valid document.")