"""Streamlit demo that turns a GROBID-processed paper (TEI XML) into slides.

Flow of the script:

1. Show the introduction and let the user pick a slide-count range.
2. Accept an uploaded TEI XML file and store it under ``./uploads``.
3. Segment the paper, summarize each segment, and keep the resulting
   markdown slides in ``st.session_state``.
4. Let the user page through and edit the slides, then download them as a PDF.
"""
import io
import os
import pickle
from io import BytesIO

import markdown
import numpy as np
import pandas as pd
import streamlit as st
import torch
from grobidmonkey import reader
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
from weasyprint import HTML, CSS

from document import Document
from BartSE import BARTAutoEncoder

# Directory that receives the uploaded TEI files.
UPLOAD_DIR = "./uploads"

# Stylesheet applied by WeasyPrint when the slides are rendered to PDF.
PDF_CSS = """
@page {
    size: 1920px 1080px; /* Set page size to Full HD resolution */
    margin: 0; /* Remove all margins */
}
body {
    font-family: sans-serif;
    background-color: #45474B; /* Set background color to grey */
    margin: 0; /* Remove body margin */
    padding: 0; /* Remove body padding */
}
.content {
    background-color: #45474B; /* Ensure the background color spans the full page */
    color: #F5F7F8; /* Set font color to white */
    padding: 20mm; /* Set padding to create text margins */
    box-sizing: border-box; /* Include padding in the element's total width and height */
}
.page {
    font-size: 32pt; /* Adjust the font size as needed */
    margin: 0; /* Remove margin from page content */
    padding: 0; /* Remove padding from page content */
}
"""

# Empty element that forces the next slide onto a new PDF page.
PAGE_BREAK_HTML = '<div style="page-break-after: always;"></div>'


def save_uploaded_file(uploaded_file):
    """Write the uploaded file into ``UPLOAD_DIR`` and return its path.

    Args:
        uploaded_file: Object returned by ``st.file_uploader``; its ``name``
            and ``getbuffer()`` are used.

    Returns:
        The path of the saved file as a string.
    """
    # The file name is supplied by the client, so reduce it to its last path
    # component to keep the write inside UPLOAD_DIR.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "uploaded_file"

    os.makedirs(UPLOAD_DIR, exist_ok=True)  # Create the directory if needed
    file_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path


def format_slides(title_list, text_list):
    """Turn section titles and summaries into markdown slides.

    Each slide is a level-2 heading followed by one bullet per sentence,
    where sentences are obtained by splitting the summary on ``'.'``.

    Args:
        title_list: Slide titles, parallel to ``text_list``.
        text_list: Summary text for each slide.

    Returns:
        A list of markdown strings, one per slide.
    """
    slides = []
    for title, text in zip(title_list, text_list):
        bullets = "".join(
            f"- {sentence.strip()}.\n"
            for sentence in text.split('.')
            if sentence.strip()
        )
        slides.append(f"## {title}\n{bullets}")
    return slides


def turn_page(direction):
    """Move to the next or previous slide and load its text.

    Args:
        direction: ``"next"`` or ``"prev"``; any other value, or a move past
            either end, leaves the page index unchanged.
    """
    state = st.session_state
    last_index = len(state.summ_text) - 1
    if direction == "next" and state.page_index < last_index:
        state.page_index += 1
    elif direction == "prev" and state.page_index > 0:
        state.page_index -= 1
    state.current_text = state.summ_text[state.page_index]


def update_text():
    """Store the edited text area content as the current slide's text."""
    state = st.session_state
    state.summ_text[state.page_index] = state.text_area_value
    state.current_text = state.text_area_value


def render_markdown_to_html(markdown_str):
    """Convert a markdown string to an HTML fragment."""
    return markdown.markdown(markdown_str)


def generate_pdf(html_string):
    """Render an HTML document to PDF using ``PDF_CSS``.

    Args:
        html_string: The complete HTML to render.

    Returns:
        A ``BytesIO`` positioned at the start of the PDF data.
    """
    pdf = BytesIO()
    HTML(string=html_string).write_pdf(pdf, stylesheets=[CSS(string=PDF_CSS)])
    pdf.seek(0)
    return pdf


def create_pdf_from_markdown_strings(markdown_strings):
    """Build the HTML document for the slides, one slide per PDF page.

    Despite its name, this returns HTML; ``generate_pdf`` produces the PDF.

    Args:
        markdown_strings: Markdown source of each slide, in order.

    Returns:
        A single HTML string with a page break between consecutive slides.
    """
    # NOTE(review): the wrapper markup below was reconstructed from the
    # `.content` / `.page` selectors in PDF_CSS and the page-break intent of
    # the original loop -- confirm it matches the intended layout.
    pages = [
        f'<div class="content"><div class="page">'
        f'{render_markdown_to_html(md)}'
        f'</div></div>'
        for md in markdown_strings
    ]
    # Joining places a break between slides but not after the last one.
    body = PAGE_BREAK_HTML.join(pages)
    return (
        '<!DOCTYPE html><html><head><meta charset="utf-8"></head><body>'
        f'{body}'
        '</body></html>'
    )


st.title('Paper2Slides')

st.markdown("""
This space is a live demo of the [Zehao Lu](https://www.linkedin.com/in/zehao-lu/)’s thesis at Utrecht University (and internship project at ML6), supervised by [Guanyi Chen](https://a-quei.github.io/) (During his time in Utrecht University) and [Konstantin Buschmer](https://www.linkedin.com/in/konstantin-buschmeier/?locale=de_DE) (ML6).

To use this space:

1. Have a paper that you want to turn into slides.
2. Process your paper using GROBID. If you have GROBID installed, run it and use the output. If not, you can use GROBID’s [live demo](https://kermitt2-grobid.hf.space/) to generate the processed TEI.xml file. To use the live demo, click on `TEI`, select `Process Fulltext Document` under Service to call, choose the paper file, and then click `submit`.

""")

st.image("Screencastfrom23-09-24151531-ezgif.com-optimize.gif")
st.markdown("Now let's start to use **Paper2Slides**!")

st.subheader('Set slide numbers')
st.markdown("Specify the range of slide numbers you want to generate.")
range_values = st.slider(
    'Select a range',
    min_value=0,
    max_value=100,
    value=(0, 25)
)

st.subheader('Upload paper in TEI.xml format')
col1, col2 = st.columns([3, 1])
with col1:
    uploaded_file = st.file_uploader("Choose a file")
with col2:
    option = st.selectbox(
        'Select parsing method.',
        ('monkey', 'x2d', 'lxml'))

summ_text = None
# Generation runs only once per session; later reruns reuse the session state.
if (uploaded_file is not None) and ('generation_done' not in st.session_state):
    st.write(uploaded_file.name)
    bytes_data = uploaded_file.getvalue()
    st.write(len(bytes_data), "bytes")

    saved_file_path = save_uploaded_file(uploaded_file)
    monkey_reader = reader.MonkeyReader(option)
    # Read paper content
    essay = monkey_reader.readEssay(saved_file_path)

    with st.status("Understanding paper..."):
        bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        summ_model_path = 'com3dian/Bart-large-paper2slides-summarizer'
        summarizor = BartForConditionalGeneration.from_pretrained(summ_model_path)
        exp_model_path = 'com3dian/Bart-large-paper2slides-expander'
        expandor = BartForConditionalGeneration.from_pretrained(exp_model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): the summarizer is passed for both model arguments and
        # `expandor` is never used -- confirm whether `expandor` was meant to
        # be the second argument.
        autoencoder = BARTAutoEncoder(summarizor, summarizor, device)
        del summarizor, expandor  # drop these references before the next model load

        document = Document(essay, bart_tokenizer)
        del bart_tokenizer
        document.merge(range_values[0], range_values[1], autoencoder, device)

    with st.status("Generating slides..."):
        summarizor = pipeline("summarization", model=summ_model_path, device=device)
        title_list = document.segmentation['key']
        summaries = summarizor(
            document.segmentation['text'],
            max_length=100,
            min_length=10,
            do_sample=False,
        )
        summ_text = [item['summary_text'] for item in summaries]

    st.session_state.generation_done = True

if (summ_text is not None) or ('summ_text' in st.session_state):
    # Initialize session state for page index and text
    if 'page_index' not in st.session_state:
        st.session_state.page_index = 0

    if 'summ_text' not in st.session_state:
        st.session_state.summ_text = format_slides(title_list, summ_text)

    if 'current_text' not in st.session_state:
        st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]

    st.subheader('Generated slides content')

    # NOTE(review): the three whitespace-only markdown calls around the text
    # area appear to have lost inline markup (presumably a style block and a
    # framing element); restore that markup or remove the calls.
    st.markdown(""" """, unsafe_allow_html=True)

    # Display the framed text area
    st.markdown('\n', unsafe_allow_html=True)
    st.text_area(
        "Edit Text",
        st.session_state.current_text,
        height=200,
        key="text_area_value",
        on_change=update_text
    )
    st.markdown('\n', unsafe_allow_html=True)

    # Display page turner controls
    col1, col2, col3 = st.columns([2.25, 12, 1.7])

    # Previous button in col1
    with col1:
        st.button("Previous", on_click=turn_page, args=("prev",))

    # Center aligned text in col2
    with col2:
        st.markdown(
            f'<div style="text-align: center;">'
            f'Page {st.session_state.page_index + 1} of {len(st.session_state.summ_text)}'
            f'</div>',
            unsafe_allow_html=True
        )

    # Next button in col3, right aligned
    with col3:
        st.button("Next", on_click=turn_page, args=("next",))

    # Display the rendered slide below the editor
    st.markdown(st.session_state.current_text)

    html_content = create_pdf_from_markdown_strings(st.session_state.summ_text)
    pdf_file = generate_pdf(html_content)

    # Provide download link
    st.download_button(
        label="Download PDF",
        data=pdf_file,
        file_name="slides.pdf",
        mime="application/pdf"
    )