Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import os | |
| import pickle | |
| import torch | |
| import markdown | |
| from weasyprint import HTML, CSS | |
| import io | |
| from io import BytesIO | |
| from grobidmonkey import reader | |
| from transformers import pipeline | |
| from transformers import BartTokenizer, BartModel, BartForConditionalGeneration | |
| from transformers import T5Tokenizer, T5ForConditionalGeneration | |
| from document import Document | |
| from BartSE import BARTAutoEncoder | |
def save_uploaded_file(uploaded_file):
    """Write an uploaded file into ``./uploads`` and return the path it was saved to.

    Args:
        uploaded_file: Object exposing ``.name`` (the client-supplied file
            name) and ``.getbuffer()`` (the file content as a bytes-like
            object).

    Returns:
        The path of the saved file as a string, e.g. ``./uploads/paper.pdf``.

    Raises:
        ValueError: If the supplied name has no usable final component
            (empty, ``.`` or ``..``).
    """
    upload_dir = "./uploads"

    # The name comes from the client: keep only its final path component so
    # that something like "../x.pdf" cannot be written outside upload_dir.
    safe_name = os.path.basename(uploaded_file.name)
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Uploaded file has no usable name: {uploaded_file.name!r}")

    os.makedirs(upload_dir, exist_ok=True)  # Create 'uploads' directory if it doesn't exist
    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path  # Return the file path as a string
# Page header and input widgets for the Paper2Slides app.
| st.title('Paper2Slides') | |
| st.subheader('Upload paper in pdf format') | |
# Two columns with a 3:1 width ratio: the file uploader goes in the wide one,
# the parsing-method selector in the narrow one.
| col1, col2 = st.columns([3, 1]) | |
| with col1: | |
| uploaded_file = st.file_uploader("Choose a file") | |
| with col2: | |
# `option` is later passed to reader.MonkeyReader to choose the parser.
| option = st.selectbox( | |
| 'Select parsing method.', | |
| ('monkey', 'x2d', 'lxml')) | |
# Range slider over 0..100 with default (0, 25); both bounds are later passed
# to document.merge().
# NOTE(review): indentation was lost in this copy of the file, so it is not
# visible whether the slider belongs inside `with col2:` or at top level — confirm.
| range_values = st.slider( | |
| 'Select a range of values', | |
| min_value=0, | |
| max_value=100, | |
| value=(0, 25) | |
| ) | |
# Default for this script run; the slide-editor section checks
# `summ_text is not None` to detect that summaries were produced in this run.
| summ_text = None | |
# Generation pipeline: parse the uploaded paper, segment it, and summarise each
# segment. Runs only when a file is present and no earlier run in this session
# has set the `generation_done` flag (set at the end of this block).
if uploaded_file is not None and 'generation_done' not in st.session_state:
    st.write(uploaded_file.name)
    bytes_data = uploaded_file.getvalue()
    st.write(len(bytes_data), "bytes")

    saved_file_path = save_uploaded_file(uploaded_file)
    monkeyReader = reader.MonkeyReader(option)
    # read paper content
    essay = monkeyReader.readEssay(saved_file_path)

    with st.status("Understanding paper..."):
        Barttokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        summ_model_path = 'com3dian/Bart-large-paper2slides-summarizer'
        summarizor = BartForConditionalGeneration.from_pretrained(summ_model_path)
        exp_model_path = 'com3dian/Bart-large-paper2slides-expander'
        expandor = BartForConditionalGeneration.from_pretrained(exp_model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Fix: the expander was loaded and then deleted without ever being
        # used, while the summarizer was passed for both model arguments.
        # NOTE(review): assumes BARTAutoEncoder's second argument is the
        # expander model — confirm against BartSE.BARTAutoEncoder.
        BartSE = BARTAutoEncoder(summarizor, expandor, device)
        del summarizor, expandor  # drop these references before the pipeline below loads its own model

        document = Document(essay, Barttokenizer)
        del Barttokenizer
        length = document.merge(range_values[0], range_values[1], BartSE, device)

    with st.status("Generating slides..."):
        summarizor = pipeline("summarization", model=summ_model_path, device=device)
        # Segment titles and texts; the summaries below are produced from the texts.
        title_list = document.segmentation['key']
        summ_text = summarizor(
            document.segmentation['text'],
            max_length=100,
            min_length=10,
            do_sample=False,
        )
        summ_text = [text['summary_text'] for text in summ_text]

    # Mark the session as done so later script runs skip this block.
    st.session_state.generation_done = True
# Slide editor and PDF export. Entered when summaries were produced during this
# script run (`summ_text` is set) or slides are already stored in session state.
if (summ_text is not None) or ('summ_text' in st.session_state):

    def format(title_list, text_list):
        """Build one Markdown slide string per summary.

        Each slide is a ``## <title>`` heading followed by one ``- `` bullet
        per sentence of the summary, every bullet terminated with a period.

        Args:
            title_list: Slide titles, indexed in step with ``text_list``.
            text_list: Summary texts, one per slide.

        Returns:
            A list of Markdown strings, one per entry of ``text_list``.

        NOTE: the name shadows the builtin ``format``; it is kept unchanged so
        any existing reference to this function keeps working.
        """
        import re  # local import so this block does not depend on the file's import list

        format_list = []
        for index, text in enumerate(text_list):
            title = "## " + title_list[index] + "\n"
            # Split only on a period that ends a sentence (followed by
            # whitespace or end of text), so decimals such as "0.95" and
            # dotted tokens are not cut into separate bullets.
            sentences = re.split(r'\.(?:\s+|$)', text)
            list_items = "".join(
                f"- {sentence.strip()}.\n" for sentence in sentences if sentence.strip()
            )
            format_list.append(title + list_items)
        return format_list

    # Initialize session state for page index and text
    if 'page_index' not in st.session_state:
        st.session_state.page_index = 0
    if 'summ_text' not in st.session_state:
        st.session_state.summ_text = format(title_list, summ_text)
    if 'current_text' not in st.session_state:
        st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]

    def turn_page(direction):
        """Move one page forward ("next") or back ("prev"), staying within bounds.

        Always refreshes ``current_text`` from the (possibly unchanged) page index.
        """
        if direction == "next" and st.session_state.page_index < len(st.session_state.summ_text) - 1:
            st.session_state.page_index += 1
        elif direction == "prev" and st.session_state.page_index > 0:
            st.session_state.page_index -= 1
        st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]

    def update_text():
        """Store the text-area content as the current page's slide text."""
        st.session_state.summ_text[st.session_state.page_index] = st.session_state.text_area_value
        st.session_state.current_text = st.session_state.text_area_value

    # Display editable text box
    text = st.text_area(
        "Edit Text",
        st.session_state.current_text,
        height=200,
        key="text_area_value",
        on_change=update_text,
    )

    # Display page turner controls
    col1, col2, col3 = st.columns([2.25, 12, 1.7])

    # Previous button in col1
    with col1:
        st.button("Previous", on_click=turn_page, args=("prev",))

    # Center aligned text in col2
    with col2:
        st.markdown(
            f'<div style="display: flex; justify-content: center; align-items: center; height: 100%;">'
            f'Page {st.session_state.page_index + 1} of {len(st.session_state.summ_text)}'
            f'</div>',
            unsafe_allow_html=True
        )

    # Next button in col3, right aligned
    with col3:
        st.button("Next", on_click=turn_page, args=("next",))

    # Render the current slide's Markdown as a preview
    st.markdown(st.session_state.current_text)

    def render_markdown_to_html(markdown_str):
        """Convert one Markdown string to an HTML fragment."""
        return markdown.markdown(markdown_str)

    def create_pdf_from_markdown_strings(markdown_strings):
        """Assemble all slides into a single HTML document.

        Args:
            markdown_strings: Markdown source of each slide, in order.

        Returns:
            A ``(combined_html, options)`` tuple: the full HTML document as a
            string, and a dict of page-size/orientation settings.
        """
        html_pages = [render_markdown_to_html(md) for md in markdown_strings]

        # Document head with a style section for font size, margins, and colors
        document_head = '''
    <html>
    <head>
    <style>
    body {
    background-color: #45474B; /* Set background color to grey */
    color: #F5F7F8; /* Set font color to white */
    }
    .page {
    font-size: 16pt; /* Adjust the font size as needed */
    margin: 1cm; /* Optional: adjust margins */
    color: #F5F7F8; /* Ensure font color is white within pages */
    }
    </style>
    </head>
    <body>
    '''
        # A page break goes between consecutive slides only, never after the last one.
        page_break = '<div style="page-break-after: always;"></div>'
        pages_html = page_break.join(
            f'<div class="page">{page}</div>' for page in html_pages
        )
        combined_html = document_head + pages_html + '</body></html>'

        # PDF options: landscape orientation and page size.
        # These are returned for callers but are not consumed in this file;
        # the page size actually applied comes from the @page rule in generate_pdf.
        options = {
            'page-width': '297mm',   # Width of A4 page in landscape mode
            'page-height': '210mm',  # Height of A4 page in landscape mode
            'orientation': 'Landscape'
        }
        return combined_html, options

    def generate_pdf(html_string):
        """Render an HTML string to PDF and return it as a rewound ``BytesIO``."""
        css = """
    @page {
    size: A4 landscape;
    margin: 20mm;
    }
    body {
    font-family: sans-serif;
    }
    """
        pdf = BytesIO()
        HTML(string=html_string).write_pdf(pdf, stylesheets=[CSS(string=css)])
        pdf.seek(0)  # rewind so the download reads from the start
        return pdf

    # Fix: create_pdf_from_markdown_strings returns (html, options). Previously
    # the whole tuple was handed to generate_pdf; only the HTML string belongs there.
    html_content, _pdf_options = create_pdf_from_markdown_strings(st.session_state.summ_text)
    pdf_file = generate_pdf(html_content)

    # Provide download link
    st.download_button(
        label="Download PDF",
        data=pdf_file,
        file_name="slides.pdf",
        mime="application/pdf"
    )