import os
import tempfile
from multiprocessing import Pool, cpu_count

import fitz  # PyMuPDF
import streamlit as st
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration
# Load summarization pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
# Load translation model and tokenizer (English source; the target language is set per request)
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
# Maximum chunk length, measured in characters (not tokens)
max_chunk_length = 1024
# Split text into sentence-based chunks of at most max_chunk_length characters
def chunk_text(text, max_chunk_length):
    chunks = []
    current_chunk = ""
    for sentence in text.split("."):
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_length:
            if current_chunk != "":
                current_chunk += " "
            current_chunk += sentence.strip()
        else:
            if current_chunk != "":  # avoid appending an empty first chunk
                chunks.append(current_chunk)
            current_chunk = sentence.strip()
    if current_chunk != "":
        chunks.append(current_chunk)
    return chunks
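
# A minimal sketch of the expected behaviour (example values assumed, not from
# the original source); note that sentence-final periods are dropped:
#   chunk_text("Alpha beta. Gamma delta", 15)
#   -> ["Alpha beta", "Gamma delta"]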
# Summarize a single chunk, then translate the summary
def summarize_and_translate_chunk(chunk, lang):
    summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summary_text = summary[0]["summary_text"]
    # Translate the summary
    translated_chunk = translate_summary(summary_text, lang)
    return translated_chunk
# Translate a summary into the target language, chunking it if necessary
def translate_summary(summary, lang):
    # Chunk the text if it exceeds the maximum length
    if len(summary) > max_chunk_length:
        chunks = chunk_text(summary, max_chunk_length)
    else:
        chunks = [summary]
    # Translate each chunk
    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[lang],
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        translated_chunks.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    return " ".join(translated_chunks)
# Read the PDF and summarize/translate it chunk by chunk
def summarize_and_translate_pdf(uploaded_file, lang):
    # Save the uploaded PDF to a temporary file so PyMuPDF can open it by path
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file_path = temp_file.name
    try:
        doc = fitz.open(temp_file_path)
    except FileNotFoundError:
        st.error("File not found. Please make sure the file path is correct.")
        return []
    total_pages = len(doc)
    chunks = []
    for i in range(total_pages):
        page = doc.load_page(i)
        text = page.get_text()
        chunks.extend([text[j:j + max_chunk_length] for j in range(0, len(text), max_chunk_length)])
    # Parallelize across CPU cores; this assumes a fork-based start method so
    # that worker processes inherit the already-loaded models
    with Pool(cpu_count()) as pool:
        translated_chunks = pool.starmap(summarize_and_translate_chunk, [(chunk, lang) for chunk in chunks])
    # Delete the temporary file
    os.unlink(temp_file_path)
    return translated_chunks
# Streamlit UI
st.title("PDF Summarization and Translation")
# File upload
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    # Display the uploaded file name
    st.write("Uploaded PDF file:", uploaded_file.name)
    # Language selection (mBART-50 language codes)
    languages = {
        "Arabic": "ar_AR", "Czech": "cs_CZ", "German": "de_DE", "English": "en_XX", "Spanish": "es_XX",
        "Estonian": "et_EE", "Finnish": "fi_FI", "French": "fr_XX", "Gujarati": "gu_IN", "Hindi": "hi_IN",
        "Italian": "it_IT", "Japanese": "ja_XX", "Kazakh": "kk_KZ", "Korean": "ko_KR", "Lithuanian": "lt_LT",
        "Latvian": "lv_LV", "Burmese": "my_MM", "Nepali": "ne_NP", "Dutch": "nl_XX", "Romanian": "ro_RO",
        "Russian": "ru_RU", "Sinhala": "si_LK", "Turkish": "tr_TR", "Vietnamese": "vi_VN", "Chinese": "zh_CN",
        "Afrikaans": "af_ZA", "Azerbaijani": "az_AZ", "Bengali": "bn_IN", "Persian": "fa_IR", "Hebrew": "he_IL",
        "Croatian": "hr_HR", "Indonesian": "id_ID", "Georgian": "ka_GE", "Khmer": "km_KH", "Macedonian": "mk_MK",
        "Malayalam": "ml_IN", "Mongolian": "mn_MN", "Marathi": "mr_IN", "Polish": "pl_PL", "Pashto": "ps_AF",
        "Portuguese": "pt_XX", "Swedish": "sv_SE", "Swahili": "sw_KE", "Tamil": "ta_IN", "Telugu": "te_IN",
        "Thai": "th_TH", "Tagalog": "tl_XX", "Ukrainian": "uk_UA", "Urdu": "ur_PK", "Xhosa": "xh_ZA",
        "Galician": "gl_ES", "Slovene": "sl_SI"
    }
    lang = st.selectbox("Select language for translation", list(languages.keys()))
    # Summarize and translate the PDF
    if st.button("Summarize and Translate"):
        translated_chunks = summarize_and_translate_pdf(uploaded_file, languages[lang])
        # Display the translated text
        st.header("Translated Summary")
        for chunk in translated_chunks:
            st.write(chunk)
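
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py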