Spaces:
Sleeping
Sleeping
File size: 12,129 Bytes
ea5c59c 8d4620d 6f670c5 7ee71e2 b0eedcf 35fd393 b0eedcf 2341ee3 9ec3b13 b8609ad ea5c59c 0cbdeb5 bd0ad19 dff7979 45b30e7 dff7979 2135f24 7b0cbf1 25438e2 a28a21d dff7979 7b0cbf1 cc907b2 73e56ea f75d3f2 09a54a9 f75d3f2 e1d5db1 d43bd0b 1bf4865 d43bd0b cc907b2 d43bd0b cc907b2 8d4620d 28c51ee 8d4620d ea5c59c cc907b2 54f71b8 39f102c 7b8df24 0499963 c2c697a 0ed5911 f1dc184 39f102c 230d178 39f102c fda22ce 39f102c 0cbdeb5 e3748f8 e514b11 39f102c e514b11 39f102c 23cb72b 0cbdeb5 e3748f8 39f102c 48102c5 39f102c d459820 09f103b c2c697a c7f2062 b8609ad 48102c5 c7f2062 48102c5 2e52433 b8609ad c7f2062 b8609ad 48102c5 c7f2062 05ae92f c7f2062 3d2f753 c7f2062 9da7fce c7f2062 aa38316 6efd680 aa38316 6efd680 aa38316 6efd680 aa38316 6efd680 078e23b c7f2062 078e23b c7f2062 57fad31 c843614 ae440c0 57fad31 93cbec3 57fad31 92bfcdf 57fad31 c843614 92bfcdf c843614 c7f2062 96e96c5 53cd408 96e96c5 6d6a2fe 53cd408 0c871a1 195956d 0c871a1 96e96c5 1b6b526 53cd408 96e96c5 53cd408 96e96c5 6d6a2fe 53cd408 96e96c5 0c871a1 96e96c5 0c871a1 c7f2062 77962b1 c7f2062 b17cdba d8303d2 b8609ad edbb1df 9877514 b17cdba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 |
import streamlit as st
import pandas as pd
import numpy as np
import os
import pickle
import torch
import markdown
from weasyprint import HTML, CSS
import io
from io import BytesIO
from grobidmonkey import reader
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration
from document import Document
from BartSE import BARTAutoEncoder
# --- Landing section: title, project background and GROBID instructions ---
st.title('Paper2Slides')
st.markdown("""
This space is a live demo of the [Zehao Lu](https://www.linkedin.com/in/zehao-lu/)’s [thesis](https://studenttheses.uu.nl/handle/20.500.12932/45939)
at Utrecht University (and internship project at [ML6](https://www.ml6.eu/)),
supervised by [Guanyi Chen](https://a-quei.github.io/) (During his time in Utrecht University) and
[Konstantin Buschmeier](https://www.linkedin.com/in/konstantin-buschmeier/) (ML6).
To use this space:
1. Have a paper that you want to turn into slides.
2. Process your paper using GROBID. If you have GROBID installed, run it and use the output. If not, you can use GROBID’s [live demo](https://kermitt2-grobid.hf.space/)
to generate the processed TEI.xml file. To use the live demo, click on `TEI`, select `Process Fulltext Document` under Service to call, choose the paper file, and then
click `submit`.
""")
# Animated walkthrough of the GROBID demo steps described above.
st.image("grobidmanual.gif")

# --- Step list for this app ---
st.markdown("### Now let's try **Paper2Slides**!")
st.markdown("""
To use this space, you need to:
1. Set the number of slides you want to generate.
2. Upload the processed `tei.xml` file.
""")

# --- Slide-count range ---
st.subheader('Set slide numbers')
st.markdown("Specify the range of slide numbers you want to generate.")
# Two-element (low, high) selection; both ends are later handed to
# ``document.merge`` as the lower and upper bound.
range_values = st.slider(
    'Select a range',
    min_value=0,
    max_value=100,
    value=(0, 25),
)
def save_uploaded_file(uploaded_file):
    """Persist an uploaded file under ``./uploads`` and return its path.

    Args:
        uploaded_file: Object exposing ``name`` (the client-supplied file
            name) and ``getbuffer()`` (the file's bytes), such as the value
            returned by ``st.file_uploader``.

    Returns:
        str: Path of the written file inside ``./uploads``.

    Raises:
        ValueError: If the supplied name has no usable file-name component.
    """
    upload_dir = "./uploads"
    # The name is chosen by the client. Keep only its last path component so
    # values such as "../../x" or "C:\\dir\\x" cannot escape ``upload_dir``.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {uploaded_file.name!r}")

    os.makedirs(upload_dir, exist_ok=True)  # Create 'uploads' directory if it doesn't exist
    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path  # Return the file path as a string
# --- Upload widget and parser selection ---
st.subheader('Upload paper in TEI.xml format')
col1, col2 = st.columns([3, 1])
with col1:
    uploaded_file = st.file_uploader("Choose a file")
with col2:
    option = st.selectbox(
        'Select parsing method.',
        ('monkey', 'x2d', 'lxml'))

# Stays None unless slides are generated during this script run.
summ_text = None

# Generate at most once per session: 'generation_done' is written to the
# session state at the end of this branch and blocks it on later reruns.
if uploaded_file is not None and 'generation_done' not in st.session_state:
    st.write(uploaded_file.name)
    bytes_data = uploaded_file.getvalue()
    st.write(len(bytes_data), "bytes")

    saved_file_path = save_uploaded_file(uploaded_file)
    monkeyReader = reader.MonkeyReader(option)
    # read paper content
    essay = monkeyReader.readEssay(saved_file_path)

    with st.status("Understanding paper...\nThis might take a while, feel free to grab a coffee!"):
        Barttokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        summ_model_path = 'com3dian/Bart-large-paper2slides-summarizer'
        summarizor = BartForConditionalGeneration.from_pretrained(summ_model_path)
        exp_model_path = 'com3dian/Bart-large-paper2slides-expander'
        expandor = BartForConditionalGeneration.from_pretrained(exp_model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): `expandor` is loaded above but never used; the
        # summarizer is passed for both model arguments. Confirm against
        # BARTAutoEncoder's signature whether the second argument should be
        # `expandor`, or drop the expander load if this is intentional.
        BartSE = BARTAutoEncoder(summarizor, summarizor, device)
        # Drop the direct references before the next model is loaded.
        del summarizor, expandor
        document = Document(essay, Barttokenizer)
        del Barttokenizer
        # Return value is not needed; `document.segmentation` is read below.
        document.merge(range_values[0], range_values[1], BartSE, device)

    with st.status("Generating slides...\nThey'll be ready shortly!"):
        summarizor = pipeline("summarization", model=summ_model_path, device=device)
        title_list = document.segmentation['key']
        summ_text = summarizor(document.segmentation['text'], max_length=100, min_length=10, do_sample=False)
        # Keep only the generated summary string of each result.
        summ_text = [text['summary_text'] for text in summ_text]

    st.session_state.generation_done = True
# Show the slide editor when slides were generated in this run, or when an
# earlier run already stored them in the session state.
if (summ_text is not None) or ('summ_text' in st.session_state):

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    def format_slides(titles, texts):
        """Build one Markdown slide per (title, text) pair.

        Each slide is a level-2 heading followed by one ``- `` bullet per
        non-blank sentence of the text, as split by ``sent_tokenize``.

        Args:
            titles: Sequence of slide titles (strings).
            texts: Sequence of summary texts, paired with ``titles`` by
                position.

        Returns:
            list[str]: Markdown source of each slide.
        """
        slides = []
        for title, text in zip(titles, texts):
            heading = "## " + title + "\n"
            # Split text into sentences using nltk's sent_tokenize and turn
            # every non-blank sentence into a Markdown list item.
            bullets = "".join(
                f"- {sentence.strip()}\n"
                for sentence in sent_tokenize(text)
                if sentence.strip()
            )
            slides.append(heading + bullets)
        return slides

    def turn_page(direction):
        """Move to the next/previous slide (clamped) and load its text.

        Args:
            direction: ``"next"`` or ``"prev"``; any other value leaves the
                page index unchanged.
        """
        last_index = len(st.session_state.summ_text) - 1
        if direction == "next" and st.session_state.page_index < last_index:
            st.session_state.page_index += 1
        elif direction == "prev" and st.session_state.page_index > 0:
            st.session_state.page_index -= 1
        st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]

    def update_text():
        """Store the edited text-area content as the current slide's text."""
        edited = st.session_state.text_area_value
        st.session_state.summ_text[st.session_state.page_index] = edited
        st.session_state.current_text = edited

    def render_markdown_to_html(markdown_str):
        """Convert a Markdown string to HTML with ``markdown.markdown``."""
        return markdown.markdown(markdown_str)

    def generate_pdf(html_string):
        """Render an HTML document to PDF using the slide stylesheet.

        Args:
            html_string: Complete HTML document to render.

        Returns:
            BytesIO: In-memory PDF, positioned at the start of the stream.
        """
        css = """
@page {
size: 1920px 1080px; /* Set page size to Full HD resolution */
margin: 0; /* Remove all margins */
}
body {
font-family: sans-serif;
background-color: #45474B; /* Set background color to grey */
margin: 0; /* Remove body margin */
padding: 0; /* Remove body padding */
}
.content {
background-color: #45474B; /* Ensure the background color spans the full page */
color: #F5F7F8; /* Set font color to white */
padding: 20mm; /* Set padding to create text margins */
box-sizing: border-box; /* Include padding in the element's total width and height */
}
.page {
font-size: 32pt; /* Adjust the font size as needed */
margin: 0; /* Remove margin from page content */
padding: 0; /* Remove padding from page content */
}
"""
        pdf = BytesIO()
        HTML(string=html_string).write_pdf(pdf, stylesheets=[CSS(string=css)])
        # Rewind so the caller reads the PDF from its first byte.
        pdf.seek(0)
        return pdf

    def create_pdf_from_markdown_strings(markdown_strings):
        """Combine Markdown slides into one HTML document, one slide per page.

        Despite its name, this returns HTML; pass the result to
        ``generate_pdf`` to obtain the PDF.

        Args:
            markdown_strings: Iterable of Markdown slide sources.

        Returns:
            str: HTML document with a page-break element between slides.
        """
        # Document head with the per-page font size / spacing rules.
        html_head = '''
<html>
<head>
<style>
.page {
font-size: 32pt; /* Adjust the font size as needed */
margin: 0; /* Remove margin from page content */
padding: 0; /* Remove padding from page content */
}
</style>
</head>
<body>
'''
        page_break = '<div style="page-break-after: always;"></div>'
        pages = [
            f'<div class="content"><div class="page">{render_markdown_to_html(md)}</div></div>'
            for md in markdown_strings
        ]
        # Joining puts a break between pages only, never after the last one.
        return html_head + page_break.join(pages) + '</body></html>'

    # ------------------------------------------------------------------
    # Session state: page index, slide texts and the text being edited
    # ------------------------------------------------------------------
    if 'page_index' not in st.session_state:
        st.session_state.page_index = 0
    if 'summ_text' not in st.session_state:
        st.session_state.summ_text = format_slides(title_list, summ_text)
    if 'current_text' not in st.session_state:
        st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]

    # ------------------------------------------------------------------
    # Editor
    # ------------------------------------------------------------------
    st.subheader('Generated slides content')
    # Display editable text box; edits are written back by update_text.
    st.text_area("Edit Text", st.session_state.current_text, height=200, key="text_area_value", on_change=update_text)

    # Display page turner controls
    prev_col, counter_col, next_col = st.columns([2.25, 12, 1.7])
    # Previous button on the left
    with prev_col:
        st.button("Previous", on_click=turn_page, args=("prev",))
    # Centered page counter
    with counter_col:
        st.markdown(
            f'<div style="display: flex; justify-content: center; align-items: center; height: 100%;">'
            f'Page {st.session_state.page_index + 1} of {len(st.session_state.summ_text)}'
            f'</div>',
            unsafe_allow_html=True
        )
    # Next button on the right
    with next_col:
        st.button("Next", on_click=turn_page, args=("next",))

    # ------------------------------------------------------------------
    # Preview of the current slide inside a bordered frame
    # ------------------------------------------------------------------
    frame_css = """
<style>
.framed-markdown {
border: 2px solid #a2a3a2; /* Border color */
padding: 10px; /* Space inside the border */
border-radius: 5px; /* Rounded corners */
background-color: transparent; /* Background color */
margin: 10px 0; /* Margin around the frame */
}
</style>
"""
    # Inject CSS into the Streamlit app
    st.markdown(frame_css, unsafe_allow_html=True)
    # Render the markdown content within the framed box
    st.markdown(
        f'<div class="framed-markdown">{render_markdown_to_html(st.session_state.current_text)}</div>',
        unsafe_allow_html=True
    )

    # ------------------------------------------------------------------
    # PDF export of all slides
    # ------------------------------------------------------------------
    html_content = create_pdf_from_markdown_strings(st.session_state.summ_text)
    pdf_file = generate_pdf(html_content)
    st.write("\n\n\n")
    # Provide download link
    st.download_button(
        label="Download PDF",
        data=pdf_file,
        file_name="slides.pdf",
        mime="application/pdf"
    )
# Closing notes: thanks, link to the thesis, contact line, BibTeX entry and a
# pointer to the Grobidmonkey parser.
# NOTE(review): the contact line reads "[email protected]", which looks like
# an e-mail obfuscation placeholder rather than a real address — confirm and
# restore the intended address.
# NOTE(review): the file's indentation was lost; this footer is kept at top
# level (always shown). Confirm it was not meant to sit inside the results
# branch above.
footer_markdown = """
-----------------------------------------
Great! Thank you for using this huggingface space.\n
If you want to know more about this application, you can take a look at the [paper](https://studenttheses.uu.nl/handle/20.500.12932/45939).\n
To contact the author you can send an email to [email protected];\n
To cite the paper you can use Bibtex\n
```
@mastersthesis{lu2024unsupervised,
title={Unsupervised Paper2Slides Generation},
author={Lu, Zehao},
year={2024}
}\n
```\n
To see how was the grobid's output is parsed, check [Grobidmonkey](https://github.com/com3dian/Grobidmonkey).
"""
st.markdown(footer_markdown)
|