Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import os | |
import pickle | |
import torch | |
import markdown | |
from weasyprint import HTML, CSS | |
import io | |
from io import BytesIO | |
from grobidmonkey import reader | |
import nltk | |
nltk.download('punkt') | |
nltk.download('punkt_tab') | |
from nltk.tokenize import sent_tokenize | |
from transformers import pipeline | |
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration | |
from transformers import T5Tokenizer, T5ForConditionalGeneration | |
from document import Document | |
from BartSE import BARTAutoEncoder | |
# Page header and usage instructions rendered above the controls.
st.title('Paper2Slides')

st.markdown("""
This space is a live demo of [Zehao Lu](https://www.linkedin.com/in/zehao-lu/)’s [thesis](https://studenttheses.uu.nl/handle/20.500.12932/45939)
at Utrecht University (and internship project at [ML6](https://www.ml6.eu/)),
supervised by [Guanyi Chen](https://a-quei.github.io/) (During his time in Utrecht University) and
[Konstantin Buschmeier](https://www.linkedin.com/in/konstantin-buschmeier/) (ML6).
To use this space:
1. Have a paper that you want to turn into slides.
2. Process your paper using GROBID. If you have GROBID installed, run it and use the output. If not, you can use GROBID’s [live demo](https://kermitt2-grobid.hf.space/)
to generate the processed TEI.xml file. To use the live demo, click on `TEI`, select `Process Fulltext Document` under Service to call, choose the paper file, and then
click `submit`.
""")

st.image("grobidmanual.gif")

st.markdown("### Now let's try **Paper2Slides**!")

# Step 2 previously read "Update the processed file"; the action the user has
# to take is an upload (see the file uploader further down the page).
st.markdown("""
To use this space, you need to:
1. Set the number of slides you want to generate.
2. Upload the processed `tei.xml` file.
""")

st.subheader('Set slide numbers')
st.markdown("Specify the range of slide numbers you want to generate.")

# Two-handle slider: `range_values` is a (low, high) pair, read later as
# range_values[0] and range_values[1].
range_values = st.slider(
    'Select a range',
    min_value=0,
    max_value=100,
    value=(0, 25)
)
def save_uploaded_file(uploaded_file):
    """Write an uploaded file into ``./uploads`` and return the saved path.

    Args:
        uploaded_file: Object exposing ``name`` (the file name supplied by the
            client) and ``getbuffer()`` (bytes-like content).

    Returns:
        str: Path of the written file inside ``./uploads``.

    Raises:
        ValueError: If ``uploaded_file.name`` has no usable final component.
    """
    upload_dir = "./uploads"

    # The name is chosen by the client.  Keep only its last path component so
    # values such as "../../app.py" cannot be written outside the upload
    # directory; backslashes are normalised so Windows-style names are
    # treated the same way.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Unusable upload file name: {uploaded_file.name!r}")

    os.makedirs(upload_dir, exist_ok=True)  # Create 'uploads' directory if it doesn't exist
    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path  # Return the file path as a string
st.subheader('Upload paper in TEI.xml format')
col1, col2 = st.columns([3, 1])
with col1:
    uploaded_file = st.file_uploader("Choose a file")
with col2:
    option = st.selectbox(
        'Select parsing method.',
        ('monkey', 'x2d', 'lxml'))

# Stays None on reruns that skip generation; the slide viewer below then
# relies on the results kept in st.session_state.
summ_text = None

if uploaded_file is not None:
    bytes_data = uploaded_file.getvalue()

    # Streamlit reruns this script on every widget interaction, so the slow
    # pipeline is guarded by the `generation_done` flag.  On its own that flag
    # also ignores a *different* file uploaded later in the same session, so
    # remember which upload produced the stored slides and start over when
    # it changes.
    upload_id = (uploaded_file.name, len(bytes_data))
    if ('processed_upload' not in st.session_state
            or st.session_state['processed_upload'] != upload_id):
        for stale_key in ('generation_done', 'summ_text', 'current_text',
                          'page_index', 'text_area_value'):
            if stale_key in st.session_state:
                del st.session_state[stale_key]

    if 'generation_done' not in st.session_state:
        st.write(uploaded_file.name)
        st.write(len(bytes_data), "bytes")
        saved_file_path = save_uploaded_file(uploaded_file)
        monkeyReader = reader.MonkeyReader(option)
        # read paper content
        essay = monkeyReader.readEssay(saved_file_path)

        with st.status("Understanding paper...\nThis might take a while, feel free to grab a coffee!"):
            Barttokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
            summ_model_path = 'com3dian/Bart-large-paper2slides-summarizer'
            summarizor = BartForConditionalGeneration.from_pretrained(summ_model_path)
            exp_model_path = 'com3dian/Bart-large-paper2slides-expander'
            expandor = BartForConditionalGeneration.from_pretrained(exp_model_path)
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # NOTE(review): `summarizor` is passed twice and `expandor` is
            # loaded but never used.  Presumably the second argument should be
            # `expandor` -- confirm against BartSE.BARTAutoEncoder before
            # changing, since that would alter the segmentation results.
            BartSE = BARTAutoEncoder(summarizor, summarizor, device)
            del summarizor, expandor
            document = Document(essay, Barttokenizer)
            del Barttokenizer
            # The slider bounds are handed to merge() as its first two arguments.
            length = document.merge(range_values[0], range_values[1], BartSE, device)

        with st.status("Generating slides...\nThey'll be ready shortly!"):
            summarizor = pipeline("summarization", model=summ_model_path, device=device)
            title_list = document.segmentation['key']
            summ_text = summarizor(document.segmentation['text'], max_length=100, min_length=10, do_sample=False)
            summ_text = [text['summary_text'] for text in summ_text]

        st.session_state.generation_done = True
        st.session_state.processed_upload = upload_id
# Slide viewer/editor.  Runs when slides were produced during this script run
# or are already stored in st.session_state from an earlier rerun.
if (summ_text is not None) or ('summ_text' in st.session_state):

    def format(title_list, text_list):
        """Build the Markdown source of one slide per summary.

        Each slide is a level-2 heading followed by a bullet list with one
        item per sentence returned by ``sent_tokenize``.

        Args:
            title_list: Slide titles, indexed in step with ``text_list``.
            text_list: Summary text for each slide.

        Returns:
            list[str]: Markdown for every slide, in input order.
        """
        format_list = []
        for index, text in enumerate(text_list):
            title = "## " + title_list[index] + "\n"
            # Split text into sentences using nltk's sent_tokenize
            sentences = sent_tokenize(text)
            # One Markdown bullet per non-blank sentence.
            list_items = "".join(
                f"- {sentence.strip()}\n" for sentence in sentences if sentence.strip()
            )
            format_list.append(title + list_items)
        return format_list

    # Seed session state once; later reruns reuse (and edit) these values.
    if 'page_index' not in st.session_state:
        st.session_state.page_index = 0
    if 'summ_text' not in st.session_state:
        st.session_state.summ_text = format(title_list, summ_text)
    if 'current_text' not in st.session_state:
        st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]

    def turn_page(direction):
        """Move one slide forward ("next") or back ("prev") within bounds.

        Any other value, or a move past either end, leaves the index as is.
        ``current_text`` is refreshed from the slide at the resulting index.
        """
        if direction == "next" and st.session_state.page_index < len(st.session_state.summ_text) - 1:
            st.session_state.page_index += 1
        elif direction == "prev" and st.session_state.page_index > 0:
            st.session_state.page_index -= 1
        st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]

    def update_text():
        """Store the text-area content as the current slide's Markdown."""
        st.session_state.summ_text[st.session_state.page_index] = st.session_state.text_area_value
        st.session_state.current_text = st.session_state.text_area_value

    st.subheader('Generated slides content')

    # Display editable text box
    text = st.text_area("Edit Text", st.session_state.current_text, height=200, key="text_area_value", on_change=update_text)

    # Display page turner controls
    col1, col2, col3 = st.columns([2.25, 12, 1.7])

    # Previous button in col1
    with col1:
        st.button("Previous", on_click=turn_page, args=("prev",))

    # Center aligned text in col2
    with col2:
        st.markdown(
            f'<div style="display: flex; justify-content: center; align-items: center; height: 100%;">'
            f'Page {st.session_state.page_index + 1} of {len(st.session_state.summ_text)}'
            f'</div>',
            unsafe_allow_html=True
        )

    # Next button in col3, right aligned
    with col3:
        st.button("Next", on_click=turn_page, args=("next",))

    # CSS styling to create a frame around the rendered slide preview
    frame_css = """
    <style>
    .framed-markdown {
    border: 2px solid #a2a3a2; /* Border color */
    padding: 10px; /* Space inside the border */
    border-radius: 5px; /* Rounded corners */
    background-color: transparent;; /* Background color */
    margin: 10px 0; /* Margin around the frame */
    }
    </style>
    """

    # Inject CSS into the Streamlit app
    st.markdown(frame_css, unsafe_allow_html=True)

    def render_markdown_to_html(markdown_str):
        """Convert a Markdown string to HTML with ``markdown.markdown``."""
        return markdown.markdown(markdown_str)

    # Render the markdown content within the framed box
    st.markdown(
        f'<div class="framed-markdown">{render_markdown_to_html(st.session_state.current_text)}</div>',
        unsafe_allow_html=True
    )

    def _refuse_url_fetch(url, *args, **kwargs):
        """URL fetcher for WeasyPrint that refuses every request."""
        raise ValueError(f"External resources are disabled for slide PDFs: {url}")

    def generate_pdf(html_string):
        """Render an HTML document to PDF and return it as a ``BytesIO``.

        Args:
            html_string: Complete HTML document to render.

        Returns:
            BytesIO: In-memory PDF, positioned at offset 0.
        """
        css = """
        @page {
        size: 1920px 1080px; /* Set page size to Full HD resolution */
        margin: 0; /* Remove all margins */
        }
        body {
        font-family: sans-serif;
        background-color: #45474B; /* Set background color to grey */
        margin: 0; /* Remove body margin */
        padding: 0; /* Remove body padding */
        }
        .content {
        background-color: #45474B; /* Ensure the background color spans the full page */
        color: #F5F7F8; /* Set font color to white */
        padding: 20mm; /* Set padding to create text margins */
        box-sizing: border-box; /* Include padding in the element's total width and height */
        }
        .page {
        font-size: 32pt; /* Adjust the font size as needed */
        margin: 0; /* Remove margin from page content */
        padding: 0; /* Remove padding from page content */
        }
        """
        pdf = BytesIO()
        # The slide text is user-editable and Markdown lets raw HTML through,
        # so the document may reference file:// or remote URLs.  Refuse all
        # fetches so rendering cannot read server files or call out to the
        # network; the slides themselves need no external resources.
        HTML(string=html_string, url_fetcher=_refuse_url_fetch).write_pdf(
            pdf, stylesheets=[CSS(string=css)]
        )
        pdf.seek(0)
        return pdf

    def create_pdf_from_markdown_strings(markdown_strings):
        """Combine Markdown slides into one HTML document, one slide per page.

        Despite the name, this returns HTML; ``generate_pdf`` turns it into a
        PDF.

        Args:
            markdown_strings: Markdown source of each slide.

        Returns:
            str: HTML document with a page break between consecutive slides.
        """
        # Document head with the style section for font size and margins
        document_head = '''
        <html>
        <head>
        <style>
        .page {
        font-size: 32pt; /* Adjust the font size as needed */
        margin: 0; /* Remove margin from page content */
        padding: 0; /* Remove padding from page content */
        }
        </style>
        </head>
        <body>
        '''
        page_break = '<div style="page-break-after: always;"></div>'
        pages = [
            f'<div class="content"><div class="page">{render_markdown_to_html(md)}</div></div>'
            for md in markdown_strings
        ]
        # join() places a break between pages only, never after the last one.
        return document_head + page_break.join(pages) + '</body></html>'

    # NOTE: the PDF is rebuilt on every rerun because the download button
    # needs its data up front.
    html_content = create_pdf_from_markdown_strings(st.session_state.summ_text)
    pdf_file = generate_pdf(html_content)

    st.write("\n\n\n")

    # Provide download link
    st.download_button(
        label="Download PDF",
        data=pdf_file,
        file_name="slides.pdf",
        mime="application/pdf"
    )
# Closing note: thanks, contact details, citation and a pointer to the parser.
# NOTE(review): "[email protected]" looks like an e-mail-obfuscation placeholder
# rather than a real address -- restore the intended contact address.
st.markdown("""
-----------------------------------------
Great! Thank you for using this huggingface space.\n
If you want to know more about this application, you can take a look at the [paper](https://studenttheses.uu.nl/handle/20.500.12932/45939).\n
To contact the author you can send an email to [email protected];\n
To cite the paper you can use Bibtex\n
```
@mastersthesis{lu2024unsupervised,
title={Unsupervised Paper2Slides Generation},
author={Lu, Zehao},
year={2024}
}\n
```\n
To see how GROBID's output is parsed, check [Grobidmonkey](https://github.com/com3dian/Grobidmonkey).
""")