import streamlit as st import pandas as pd import numpy as np import os import pickle import torch import markdown import pdfkit import io from grobidmonkey import reader from transformers import pipeline from transformers import BartTokenizer, BartModel, BartForConditionalGeneration from transformers import T5Tokenizer, T5ForConditionalGeneration from document import Document from BartSE import BARTAutoEncoder def save_uploaded_file(uploaded_file): file_path = os.path.join("./uploads", uploaded_file.name) os.makedirs("./uploads", exist_ok=True) # Create 'uploads' directory if it doesn't exist with open(file_path, "wb") as f: f.write(uploaded_file.getbuffer()) return file_path # Return the file path as a string st.title('Paper2Slides') st.subheader('Upload paper in pdf format') # col1, col2 = st.columns([3, 1]) # with col1: # uploaded_file = st.file_uploader("Choose a file") # with col2: # option = st.selectbox( # 'Select parsing method.', # ('monkey', 'x2d', 'lxml')) # if uploaded_file is not None: # st.write(uploaded_file.name) # bytes_data = uploaded_file.getvalue() # st.write(len(bytes_data), "bytes") # saved_file_path = save_uploaded_file(uploaded_file) # monkeyReader = reader.MonkeyReader(option) # outline = monkeyReader.readOutline(saved_file_path) # for pre, fill, node in outline: # st.write("%s%s" % (pre, node.name)) # # read paper content # essay = monkeyReader.readEssay(saved_file_path) # with st.status("Understanding paper..."): # Barttokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') # summ_model_path = 'com3dian/Bart-large-paper2slides-summarizer' # summarizor = BartForConditionalGeneration.from_pretrained(summ_model_path) # exp_model_path = 'com3dian/Bart-large-paper2slides-expander' # expandor = BartForConditionalGeneration.from_pretrained(exp_model_path) # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # BartSE = BARTAutoEncoder(summarizor, summarizor, device) # del summarizor, expandor # document = Document(essay, Barttokenizer) # del Barttokenizer # length = document.merge(25, 30, BartSE, device) # with st.status("Generating slides..."): # summarizor = pipeline("summarization", model=summ_model_path, device = device) # summ_text = summarizor(document.segmentation['text'], max_length=100, min_length=10, do_sample=False) # summ_text = [text['summary_text'] for text in summ_text] # for summ in summ_text: # st.write(summ) with open('slides_text.pkl', 'rb') as file: summ_text = pickle.load(file) # Function to render HTML content def format(text_list): format_list = [] for text in text_list: # Split text by periods sentences = text.split('.') # Create HTML list items list_items = "".join([f"- {sentence.strip()}.\n" for sentence in sentences if sentence.strip()]) format_list.append(list_items) return format_list # Initialize session state for page index and text if 'page_index' not in st.session_state: st.session_state.page_index = 0 if 'summ_text' not in st.session_state: st.session_state.summ_text = format(summ_text) if 'current_text' not in st.session_state: st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index] # Function to handle page turn def turn_page(direction): if direction == "next" and st.session_state.page_index < len(summ_text) - 1: st.session_state.page_index += 1 elif direction == "prev" and st.session_state.page_index > 0: st.session_state.page_index -= 1 st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index] # Function to update the current text based on text_area changes def update_text(): st.session_state.summ_text[st.session_state.page_index] = st.session_state.text_area_value st.session_state.current_text = st.session_state.text_area_value # Display editable text box text = st.text_area("Edit Text", st.session_state.current_text, height=200, key="text_area_value", on_change=update_text) # Display page turner controls col1, col2, col3 = st.columns([2.25, 12, 1.7]) # Previous button in col1 with col1: st.button("Previous", on_click=turn_page, args=("prev",)) # Center aligned text in col2 with col2: st.markdown( f'
' f'Page {st.session_state.page_index + 1} of {len(summ_text)}' f'
', unsafe_allow_html=True ) # Next button in col3, right aligned with col3: st.button("Next", on_click=turn_page, args=("next",)) # Display HTML box st.markdown(st.session_state.current_text) def render_markdown_to_html(markdown_str): return markdown.markdown(markdown_str) def create_pdf_from_markdown_strings(markdown_strings): html_pages = [render_markdown_to_html(md) for md in markdown_strings] # Combine HTML content with page breaks and add a style section for font size and margins combined_html = ''' ''' for i, page in enumerate(html_pages): combined_html += f'
{page}
' if i < len(html_pages) - 1: # Only add page break after if it's not the last page combined_html += '
' combined_html += '' # PDF options: landscape orientation and page size options = { 'page-size': 'A4', 'orientation': 'Landscape' } return html_pages def html_to_pdf(html_content): # Create a temporary file with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file: pdfkit.from_string(html_content, tmp_file.name) return tmp_file.name # Convert combined HTML to PDF directly into a file pdfkit.from_string(combined_html, output_file, options=options) html_content = create_pdf_from_markdown_strings(st.session_state.summ_text) if st.button("Download PDF"): if html_content: pdf_path = html_to_pdf(html_content) with open(pdf_path, "rb") as pdf_file: st.download_button( label="Download PDF", data=pdf_file, file_name="converted.pdf", mime="application/pdf" ) # Remove the temporary file os.remove(pdf_path)