Spaces:

ml6team
/

doc-to-slides

Sleeping

App Files Files Community

doc-to-slides / app.py

com3dian

Update app.py

f75d3f2 verified 9 months ago

raw

history blame contribute delete

12.1 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import os
	import pickle
	import torch
	import markdown
	from weasyprint import HTML, CSS
	import io
	from io import BytesIO
	from grobidmonkey import reader
	import nltk
	nltk.download('punkt')
	nltk.download('punkt_tab')
	from nltk.tokenize import sent_tokenize

	from transformers import pipeline
	from transformers import BartTokenizer, BartModel, BartForConditionalGeneration
	from transformers import T5Tokenizer, T5ForConditionalGeneration

	from document import Document
	from BartSE import BARTAutoEncoder


	# Landing section: title, credits and the GROBID preparation steps.
	st.title('Paper2Slides')
	# The supervisor's surname is spelled as in the LinkedIn URL it links to.
	st.markdown("""
	This space is a live demo of the [Zehao Lu](https://www.linkedin.com/in/zehao-lu/)’s [thesis](https://studenttheses.uu.nl/handle/20.500.12932/45939)
	at Utrecht University (and internship project at [ML6](https://www.ml6.eu/)),
	supervised by [Guanyi Chen](https://a-quei.github.io/) (During his time in Utrecht University) and
	[Konstantin Buschmeier](https://www.linkedin.com/in/konstantin-buschmeier/) (ML6).

	To use this space:

	1. Have a paper that you want to turn into slides.
	2. Process your paper using GROBID. If you have GROBID installed, run it and use the output. If not, you can use GROBID’s [live demo](https://kermitt2-grobid.hf.space/)
	to generate the processed TEI.xml file. To use the live demo, click on `TEI`, select `Process Fulltext Document` under Service to call, choose the paper file, and then
	click `submit`.
	""")

	st.image("grobidmanual.gif")
	st.markdown("### Now let's try Paper2Slides!")
	# Step 2 says "Upload": the file is supplied through the uploader widget
	# under the 'Upload paper in TEI.xml format' header.
	st.markdown("""
	To use this space, you need to:

	1. Set the number of slides you want to generate.
	2. Upload the processed `tei.xml` file.
	""")

	st.subheader('Set slide numbers')
	st.markdown("Specify the range of slide numbers you want to generate.")

	# The default is a pair, so the slider yields a (low, high) pair; both
	# ends are later passed to document.merge as range_values[0] / [1].
	range_values = st.slider(
	    'Select a range',
	    min_value=0,
	    max_value=100,
	    value=(0, 25),
	)

	def save_uploaded_file(uploaded_file):
	    """Persist an uploaded file under ``./uploads`` and return its path.

	    Only the final component of ``uploaded_file.name`` is used, so a
	    client-supplied name such as ``../../x`` cannot place the file outside
	    the uploads directory.

	    Args:
	        uploaded_file: Object exposing ``name`` (str) and ``getbuffer()``
	            (bytes-like), e.g. the value returned by ``st.file_uploader``.

	    Returns:
	        The path of the written file, as a string.

	    Raises:
	        ValueError: If the name contains no usable file component.
	    """
	    upload_dir = "./uploads"
	    # The name originates from the client: normalise Windows separators and
	    # keep only the basename so it cannot traverse directories.
	    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
	    if safe_name in ("", ".", ".."):
	        raise ValueError(f"Invalid upload file name: {uploaded_file.name!r}")
	    os.makedirs(upload_dir, exist_ok=True)  # Create 'uploads' directory if it doesn't exist
	    file_path = os.path.join(upload_dir, safe_name)
	    with open(file_path, "wb") as f:
	        f.write(uploaded_file.getbuffer())
	    return file_path  # Return the file path as a string

	st.subheader('Upload paper in TEI.xml format')

	# Two columns at a 3:1 width ratio: the uploader on the left, the
	# parsing-method selector on the right.
	upload_column, parser_column = st.columns([3, 1])
	uploaded_file = upload_column.file_uploader("Choose a file")
	option = parser_column.selectbox(
	    'Select parsing method.',
	    ('monkey', 'x2d', 'lxml'),
	)

	# Stays None on reruns where generation is skipped; the display section
	# then falls back to what is stored in st.session_state.
	summ_text = None

	# Run the model pipeline only while the 'generation_done' flag is absent;
	# the flag is set at the end of this branch.
	if uploaded_file is not None and 'generation_done' not in st.session_state:
	    st.write(uploaded_file.name)
	    st.write(len(uploaded_file.getvalue()), "bytes")

	    saved_file_path = save_uploaded_file(uploaded_file)
	    monkeyReader = reader.MonkeyReader(option)

	    # read paper content
	    essay = monkeyReader.readEssay(saved_file_path)

	    with st.status("Understanding paper...\nThis might take a while, feel free to grab a coffee!"):

	        Barttokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
	        summ_model_path = 'com3dian/Bart-large-paper2slides-summarizer'
	        summarizor = BartForConditionalGeneration.from_pretrained(summ_model_path)
	        exp_model_path = 'com3dian/Bart-large-paper2slides-expander'
	        expandor = BartForConditionalGeneration.from_pretrained(exp_model_path)
	        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        # NOTE(review): the summarizer is passed for both model arguments and
	        # the expander loaded above is never used. Confirm against
	        # BARTAutoEncoder's signature whether `expandor` was intended here.
	        BartSE = BARTAutoEncoder(summarizor, summarizor, device)
	        del summarizor, expandor

	        document = Document(essay, Barttokenizer)
	        del Barttokenizer
	        # The slider bounds are handed to merge; its return value is not used.
	        document.merge(range_values[0], range_values[1], BartSE, device)

	    with st.status("Generating slides...\nThey'll be ready shortly!"):
	        summarizor = pipeline("summarization", model=summ_model_path, device=device)
	        title_list = document.segmentation['key']
	        summ_text = summarizor(document.segmentation['text'], max_length=100, min_length=10, do_sample=False)
	        # Keep only the generated text of each pipeline result.
	        summ_text = [result['summary_text'] for result in summ_text]

	    st.session_state.generation_done = True

	if summ_text is not None or 'summ_text' in st.session_state:

	    def format(title_list, text_list):
	        """Turn parallel title/summary lists into Markdown slide strings.

	        Each slide is a ``## <title>`` heading line followed by one ``- ``
	        bullet per non-blank sentence of the matching summary; sentences are
	        split with nltk's ``sent_tokenize``. Titles are looked up by the
	        position of the summary in ``text_list``.
	        """
	        def build_slide(position, summary):
	            heading = "## " + title_list[position] + "\n"
	            bullets = (
	                f"- {sentence.strip()}\n"
	                for sentence in sent_tokenize(summary)
	                if sentence.strip()
	            )
	            return heading + "".join(bullets)

	        return [
	            build_slide(position, summary)
	            for position, summary in enumerate(text_list)
	        ]

	    # Seed the slide state only for keys that are not stored yet, so values
	    # already in the session (including user edits) are left alone.
	    slide_state = st.session_state
	    if 'page_index' not in slide_state:
	        slide_state.page_index = 0

	    if 'summ_text' not in slide_state:
	        slide_state.summ_text = format(title_list, summ_text)

	    if 'current_text' not in slide_state:
	        slide_state.current_text = slide_state.summ_text[slide_state.page_index]



	def turn_page(direction):
	    """Step the slide index for "next" / "prev" and load that slide's text.

	    The index is left unchanged at either end of the deck and for any other
	    ``direction``; ``current_text`` is refreshed from ``summ_text`` in every
	    case.
	    """
	    state = st.session_state
	    if direction == "next":
	        if state.page_index < len(state.summ_text) - 1:
	            state.page_index += 1
	    elif direction == "prev":
	        if state.page_index > 0:
	            state.page_index -= 1
	    state.current_text = state.summ_text[state.page_index]

	def update_text():
	    """Copy the text-area value into the active slide and ``current_text``.

	    Registered as the ``on_change`` callback of the "Edit Text" area, whose
	    value lives in ``st.session_state.text_area_value``.
	    """
	    state = st.session_state
	    edited = state.text_area_value
	    state.summ_text[state.page_index] = edited
	    state.current_text = edited

	st.subheader('Generated slides content')
	# Editable view of the active slide; changes are committed by update_text.
	text = st.text_area(
	    label="Edit Text",
	    value=st.session_state.current_text,
	    height=200,
	    key="text_area_value",
	    on_change=update_text,
	)

	# # Display the framed text area
	# # st.markdown('<div class="framed-text-area">', unsafe_allow_html=True)
	# # text = st.text_area(
	# # "Edit Text",
	# # st.session_state.current_text,
	# # height=200,
	# # key="text_area_value",
	# # on_change=update_text
	# # )
	# # st.markdown('</div>', unsafe_allow_html=True)

	# # Define custom CSS
	# custom_css = """
	# <style>
	# .framed-text-area {
	# border: 2px solid #000000;
	# border-radius: 5px;
	# padding: 10px;
	# margin: 10px 0;
	# }
	# .framed-text-area .stTextArea {
	# border: none;
	# }
	# </style>
	# """

	# # Inject custom CSS
	# st.markdown(custom_css, unsafe_allow_html=True)

	# # Create a container with the custom class
	# st.markdown('<div class="framed-text-area">', unsafe_allow_html=True)

	# # Your existing text area
	# text = st.text_area(
	# "Edit Text",
	# st.session_state.current_text,
	# height=200,
	# key="slide_text_area_value",
	# on_change=update_text
	# )

	# # Close the container
	# st.markdown('</div>', unsafe_allow_html=True)


	# Pager row: [Previous] [Page i of n] [Next]
	prev_column, label_column, next_column = st.columns([2.25, 12, 1.7])

	# Left: go back one slide.
	prev_column.button("Previous", on_click=turn_page, args=("prev",))

	# Middle: centred "Page i of n" indicator (1-based for display).
	page_label = (
	    '<div style="display: flex; justify-content: center; align-items: center; height: 100%;">'
	    f'Page {st.session_state.page_index + 1} of {len(st.session_state.summ_text)}'
	    '</div>'
	)
	label_column.markdown(page_label, unsafe_allow_html=True)

	# Right: advance one slide.
	next_column.button("Next", on_click=turn_page, args=("next",))

	# Display HTML box
	# st.markdown(st.session_state.current_text)

	# Inject the stylesheet for the bordered preview box (class
	# "framed-markdown") straight into the page.
	st.markdown(
	    """
	<style>
	.framed-markdown {
	border: 2px solid #a2a3a2; /* Border color */
	padding: 10px; /* Space inside the border */
	border-radius: 5px; /* Rounded corners */
	background-color: transparent;; /* Background color */
	margin: 10px 0; /* Margin around the frame */
	}
	</style>
	""",
	    unsafe_allow_html=True,
	)

	def render_markdown_to_html(markdown_str):
	    """Convert a Markdown string to HTML with ``markdown.markdown``."""
	    rendered = markdown.markdown(markdown_str)
	    return rendered

	# Show the active slide as rendered HTML inside the framed box.
	preview_html = render_markdown_to_html(st.session_state.current_text)
	st.markdown(
	    '<div class="framed-markdown">' + preview_html + '</div>',
	    unsafe_allow_html=True,
	)

	def generate_pdf(html_string):
	    """Render an HTML document to PDF with WeasyPrint.

	    The slide stylesheet below is applied on top of whatever styles the HTML
	    carries. Returns a ``BytesIO`` holding the PDF, rewound to position 0.
	    """
	    slide_css = """
	@page {
	size: 1920px 1080px; /* Set page size to Full HD resolution */
	margin: 0; /* Remove all margins */
	}
	body {
	font-family: sans-serif;
	background-color: #45474B; /* Set background color to grey */
	margin: 0; /* Remove body margin */
	padding: 0; /* Remove body padding */
	}
	.content {
	background-color: #45474B; /* Ensure the background color spans the full page */
	color: #F5F7F8; /* Set font color to white */
	padding: 20mm; /* Set padding to create text margins */
	box-sizing: border-box; /* Include padding in the element's total width and height */
	}
	.page {
	font-size: 32pt; /* Adjust the font size as needed */
	margin: 0; /* Remove margin from page content */
	padding: 0; /* Remove padding from page content */
	}
	"""
	    buffer = BytesIO()
	    source = HTML(string=html_string)
	    source.write_pdf(buffer, stylesheets=[CSS(string=slide_css)])
	    # Rewind so the caller reads the PDF from its first byte.
	    buffer.seek(0)
	    return buffer

	def create_pdf_from_markdown_strings(markdown_strings):
	    """Build one HTML document with one page per Markdown string.

	    Despite the name, this returns HTML text, not a PDF; the caller passes
	    the result to ``generate_pdf``.

	    Args:
	        markdown_strings: Iterable of Markdown strings, one per slide.

	    Returns:
	        A single HTML string. Each slide is wrapped in
	        ``<div class="content"><div class="page">...</div></div>`` and
	        consecutive slides are separated by a page-break ``<div>``; no break
	        follows the last slide.
	    """
	    html_pages = [render_markdown_to_html(md) for md in markdown_strings]

	    # Document head with the style section for the page font size and spacing.
	    document_head = '''
	<html>
	<head>
	<style>
	.page {
	font-size: 32pt; /* Adjust the font size as needed */
	margin: 0; /* Remove margin from page content */
	padding: 0; /* Remove padding from page content */
	}
	</style>
	</head>
	<body>
	'''
	    page_break = '<div style="page-break-after: always;"></div>'
	    wrapped_pages = [
	        f'<div class="content"><div class="page">{page}</div></div>'
	        for page in html_pages
	    ]
	    # Joining puts a break between pages only, and avoids growing the
	    # string with repeated += inside a loop.
	    return document_head + page_break.join(wrapped_pages) + '</body></html>'

	# Build the deck from the slides currently held in session state and
	# render it to an in-memory PDF.
	pdf_file = generate_pdf(
	    create_pdf_from_markdown_strings(st.session_state.summ_text)
	)

	st.write("\n\n\n")

	# Offer the rendered deck as a file download.
	download_options = {
	    "label": "Download PDF",
	    "data": pdf_file,
	    "file_name": "slides.pdf",
	    "mime": "application/pdf",
	}
	st.download_button(**download_options)
	# Closing notes: thanks, paper link, contact address, BibTeX entry and a
	# pointer to the Grobidmonkey parser.
	footer_markdown = """
	-----------------------------------------
	Great! Thank you for using this huggingface space.\n
	If you want to know more about this application, you can take a look at the [paper](https://studenttheses.uu.nl/handle/20.500.12932/45939).\n
	To contact the author you can send an email to [email protected];\n
	To cite the paper you can use Bibtex\n
	```
	@mastersthesis{lu2024unsupervised,
	title={Unsupervised Paper2Slides Generation},
	author={Lu, Zehao},
	year={2024}
	}\n
	```\n
	To see how was the grobid's output is parsed, check [Grobidmonkey](https://github.com/com3dian/Grobidmonkey).
	"""
	st.markdown(footer_markdown)