Spaces:

ml6team
/

doc-to-slides

Sleeping

File size: 12,129 Bytes

ea5c59c
 
 
8d4620d
6f670c5
7ee71e2
b0eedcf
35fd393
b0eedcf
2341ee3
9ec3b13
b8609ad
 
 
 
ea5c59c
0cbdeb5
 
 
 
 
 
 
 
bd0ad19
dff7979
45b30e7
 
dff7979
2135f24
7b0cbf1
 
 
25438e2
a28a21d
 
dff7979
7b0cbf1
cc907b2
73e56ea
f75d3f2
 
 
09a54a9
f75d3f2
 
 
e1d5db1
d43bd0b
1bf4865
d43bd0b
cc907b2
d43bd0b
cc907b2
 
 
 
 
8d4620d
28c51ee
 
8d4620d
 
 
ea5c59c
cc907b2
54f71b8
39f102c
 
 
 
 
 
 
7b8df24
0499963
c2c697a
0ed5911
f1dc184
39f102c
 
 
230d178
39f102c
 
fda22ce
39f102c
 
0cbdeb5
e3748f8
e514b11
39f102c
 
 
 
 
 
 
 
e514b11
39f102c
 
23cb72b
0cbdeb5
e3748f8
39f102c
48102c5
39f102c
 
d459820
 
09f103b
c2c697a
c7f2062
 
b8609ad
48102c5
c7f2062
48102c5
2e52433
b8609ad
 
c7f2062
b8609ad
48102c5
c7f2062
 
 
 
 
 
 
05ae92f
c7f2062
 
 
 
 
 
 
 
3d2f753
c7f2062
 
 
 
 
 
 
 
 
 
9da7fce
c7f2062
aa38316
 
6efd680
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa38316
6efd680
 
aa38316
 
 
 
6efd680
aa38316
 
 
6efd680
 
078e23b
c7f2062
 
 
 
 
 
 
 
 
 
078e23b
 
 
 
 
 
c7f2062
 
 
 
 
 
57fad31
 
 
 
 
c843614
ae440c0
57fad31
 
93cbec3
57fad31
 
 
 
 
 
 
92bfcdf
 
 
57fad31
c843614
 
92bfcdf
c843614
 
c7f2062
96e96c5
 
 
53cd408
 
96e96c5
 
 
6d6a2fe
53cd408
0c871a1
 
 
195956d
 
0c871a1
 
96e96c5
 
1b6b526
53cd408
 
96e96c5
 
 
 
 
 
53cd408
96e96c5
 
 
 
 
 
 
 
 
6d6a2fe
53cd408
 
96e96c5
 
 
 
 
 
0c871a1
96e96c5
 
 
 
 
0c871a1
c7f2062
 
77962b1
 
 
c7f2062
 
 
 
 
 
b17cdba
d8303d2
b8609ad
 
edbb1df
 
 
 
 
 
 
 
 
 
 
9877514
b17cdba

import streamlit as st
import pandas as pd
import numpy as np
import os
import pickle
import torch
import markdown
from weasyprint import HTML, CSS
import io
from io import BytesIO
from grobidmonkey import reader
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

from transformers import pipeline
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration

from document import Document
from BartSE import BARTAutoEncoder


# Page header and introduction: credits for the thesis / internship project and
# step-by-step instructions for producing the TEI.xml input with GROBID.
# NOTE(review): the link text below reads "Konstantin Buschmer" while the linked
# URL slug is "konstantin-buschmeier" — confirm the spelling of the name.
st.title('Paper2Slides')
st.markdown("""
This space is a live demo of the [Zehao Lu](https://www.linkedin.com/in/zehao-lu/)’s [thesis](https://studenttheses.uu.nl/handle/20.500.12932/45939) 
at Utrecht University (and internship project at [ML6](https://www.ml6.eu/)),
supervised by [Guanyi Chen](https://a-quei.github.io/) (During his time in Utrecht University) and
[Konstantin Buschmer](https://www.linkedin.com/in/konstantin-buschmeier/) (ML6).

To use this space:

1. Have a paper that you want to turn into slides.
2. Process your paper using GROBID. If you have GROBID installed, run it and use the output. If not, you can use GROBID’s [live demo](https://kermitt2-grobid.hf.space/)
to generate the processed TEI.xml file. To use the live demo, click on `TEI`, select `Process Fulltext Document` under Service to call, choose the paper file, and then
click `submit`.
""")

# Image is referenced by a relative path; presumably a recording of the GROBID
# demo steps described above — confirm the file ships next to this script.
st.image("grobidmanual.gif")
st.markdown("### Now let's try **Paper2Slides**!")
# Short checklist shown above the two input widgets (slider, then file uploader).
# Step 2 refers to the uploader further down, so it must say "Upload".
st.markdown("""
To use this space, you need to:

1. Set the number of slides you want to generate.
2. Upload the processed `tei.xml` file.
""")

st.subheader('Set slide numbers')
st.markdown("Specify the range of slide numbers you want to generate.")

# A tuple default makes this a two-handle range slider; the resulting
# (low, high) pair is later passed to document.merge(...) as its first two arguments.
range_values = st.slider(
    'Select a range',
    min_value=0,
    max_value=100,
    value=(0, 25)
    )

def save_uploaded_file(uploaded_file):
    """Write an uploaded file into ``./uploads`` and return the path it was saved to.

    Args:
        uploaded_file: Object exposing a ``name`` string and a ``getbuffer()``
            method returning bytes-like content (as used with the value of
            ``st.file_uploader`` in this script).

    Returns:
        str: Path of the written file, always located directly inside ``./uploads``.
    """
    upload_dir = "./uploads"
    # The name is supplied by the client: keep only its last path component
    # (treating backslashes as separators too) so that a value such as
    # "../app.py" cannot make the write land outside the uploads directory.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "uploaded_file"
    os.makedirs(upload_dir, exist_ok=True)  # Create 'uploads' directory if it doesn't exist
    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path  # Return the file path as a string

st.subheader('Upload paper in TEI.xml format')

# Two columns in a 3:1 width ratio: the uploader on the left, the parser choice on the right.
col1, col2 = st.columns([3, 1])
with col1:
    uploaded_file = st.file_uploader("Choose a file")
with col2:
    # The selected string is handed to reader.MonkeyReader(...) in the generation step below.
    option = st.selectbox(
        'Select parsing method.',
        ('monkey', 'x2d', 'lxml'))
    

# Reset on every script run; only the generation branch below assigns a real value.
# On runs where generation is skipped, the slide section relies on st.session_state instead.
summ_text = None

# One-shot generation step. It runs only when a file has been uploaded and this
# session has not generated slides yet; the 'generation_done' flag set at the end
# makes later script runs (page turns, text edits) skip the model work.
if (uploaded_file is not None) and ('generation_done' not in st.session_state):
    # Echo the upload's name and size back to the user.
    st.write(uploaded_file.name)
    bytes_data = uploaded_file.getvalue()
    st.write(len(bytes_data), "bytes")

    # readEssay is given a file path, so the upload is written to disk first.
    saved_file_path = save_uploaded_file(uploaded_file)
    monkeyReader = reader.MonkeyReader(option)

    # read paper content
    essay = monkeyReader.readEssay(saved_file_path)

    with st.status("Understanding paper...\nThis might take a while, feel free to grab a coffee!"):

        Barttokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        summ_model_path = 'com3dian/Bart-large-paper2slides-summarizer'
        summarizor = BartForConditionalGeneration.from_pretrained(summ_model_path)
        exp_model_path = 'com3dian/Bart-large-paper2slides-expander'
        expandor = BartForConditionalGeneration.from_pretrained(exp_model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): the summarizer is passed for BOTH model arguments and the
        # expander loaded above is never used. Confirm against BARTAutoEncoder's
        # signature whether the second argument was meant to be `expandor`.
        BartSE = BARTAutoEncoder(summarizor, summarizor, device)
        # Drop the local names; BartSE keeps whatever references it stored.
        del summarizor, expandor

        document = Document(essay, Barttokenizer)
        del Barttokenizer
        # Called for its effect on `document`; document.segmentation is read below.
        # The return value was previously bound to an unused name.
        document.merge(range_values[0], range_values[1], BartSE, device)

    with st.status("Generating slides...\nThey'll be ready shortly!"):
        # `summarizor` is rebound here to a summarization pipeline built from the same checkpoint path.
        summarizor = pipeline("summarization", model=summ_model_path, device = device)
        title_list = document.segmentation['key']
        summ_text = summarizor(document.segmentation['text'], max_length=100, min_length=10, do_sample=False)
        # Keep only the generated text of each pipeline result.
        summ_text = [text['summary_text'] for text in summ_text]

    st.session_state.generation_done = True

if (summ_text is not None) or ('summ_text' in st.session_state):
    
    # Build the Markdown source of every slide (heading + bullet list).

    def format(title_list, text_list):
        """Return one Markdown string per entry of ``text_list``.

        Each string is a ``## <title>`` heading line followed by one ``- `` bullet
        line per non-blank sentence of the text, with sentences obtained from
        nltk's ``sent_tokenize``. Titles are looked up by position in ``title_list``.
        """
        def _slide_markdown(position, body):
            heading = "## " + title_list[position] + "\n"
            bullets = [
                f"- {piece.strip()}\n"
                for piece in sent_tokenize(body)
                if piece.strip()
            ]
            return heading + "".join(bullets)

        return [_slide_markdown(position, body) for position, body in enumerate(text_list)]
    
    # First-run bootstrap of the per-session slide state. Every key is created
    # only when it is absent, so the current page and any edited text are kept
    # across Streamlit reruns.
    session = st.session_state

    if 'page_index' not in session:
        # Start on the first slide.
        session.page_index = 0

    if 'summ_text' not in session:
        # Only evaluated on the run that produced title_list / summ_text.
        session.summ_text = format(title_list, summ_text)

    if 'current_text' not in session:
        session.current_text = session.summ_text[session.page_index]
    
    
    
    # Function to handle page turn
    def turn_page(direction):
        """Button callback: step the current slide and refresh ``current_text``.

        ``direction`` is ``"next"`` or ``"prev"``. The index stays put at either
        end of the slide list, and any other value leaves it unchanged; in all
        cases ``current_text`` is reloaded from ``summ_text`` at the resulting index.
        """
        state = st.session_state
        if direction == "next":
            if state.page_index < len(state.summ_text) - 1:
                state.page_index += 1
        elif direction == "prev":
            if state.page_index > 0:
                state.page_index -= 1
        state.current_text = state.summ_text[state.page_index]
    
    # Function to update the current text based on text_area changes
    def update_text():
        """on_change callback: store the edited text area content for the current slide.

        Reads the widget value from ``st.session_state.text_area_value`` and writes
        it both into ``summ_text`` at ``page_index`` and into ``current_text``.
        """
        state = st.session_state
        edited = state.text_area_value
        state.summ_text[state.page_index] = edited
        state.current_text = edited
    
    st.subheader('Generated slides content')
    # Display editable text box
    # The widget's value lives in st.session_state under its key "text_area_value";
    # update_text reads it from there when the on_change callback fires.
    # The returned value bound to `text` is not used in the code that follows.
    text = st.text_area("Edit Text", st.session_state.current_text, height=200, key="text_area_value", on_change=update_text)
    
    # # Display the framed text area
    # # st.markdown('<div class="framed-text-area">', unsafe_allow_html=True)
    # # text = st.text_area(
    # #     "Edit Text",
    # #     st.session_state.current_text,
    # #     height=200,
    # #     key="text_area_value",
    # #     on_change=update_text
    # # )
    # # st.markdown('</div>', unsafe_allow_html=True)

    # # Define custom CSS
    # custom_css = """
    # <style>
    #     .framed-text-area {
    #         border: 2px solid #000000;
    #         border-radius: 5px;
    #         padding: 10px;
    #         margin: 10px 0;
    #     }
    #     .framed-text-area .stTextArea {
    #         border: none;
    #     }
    # </style>
    # """
    
    # # Inject custom CSS
    # st.markdown(custom_css, unsafe_allow_html=True)
    
    # # Create a container with the custom class
    # st.markdown('<div class="framed-text-area">', unsafe_allow_html=True)
    
    # # Your existing text area
    # text = st.text_area(
    #     "Edit Text",
    #     st.session_state.current_text,
    #     height=200,
    #     key="slide_text_area_value",
    #     on_change=update_text
    # )
    
    # # Close the container
    # st.markdown('</div>', unsafe_allow_html=True)

    
    # Display page turner controls
    # Narrow outer columns hold the buttons, the wide middle column the page counter.
    # col1 / col2 rebind the names used earlier for the upload layout.
    col1, col2, col3 = st.columns([2.25, 12, 1.7])
    
    # Previous button in col1
    with col1:
        # turn_page is invoked as a callback with the single argument "prev".
        st.button("Previous", on_click=turn_page, args=("prev",))
    
    # Center aligned text in col2
    with col2:
        # page_index is zero-based, hence the +1 for display.
        st.markdown(
            f'<div style="display: flex; justify-content: center; align-items: center; height: 100%;">'
            f'Page {st.session_state.page_index + 1} of {len(st.session_state.summ_text)}'
            f'</div>',
            unsafe_allow_html=True
        )
    
    # Next button in col3, right aligned
    with col3:
        # turn_page is invoked as a callback with the single argument "next".
        st.button("Next", on_click=turn_page, args=("next",))
    
    # Display HTML box
    # st.markdown(st.session_state.current_text)

    # CSS styling to create a frame
    # Defines the .framed-markdown class used by the slide preview <div> rendered below.
    # NOTE(review): the background-color declaration ends with a stray double
    # semicolon (";;") and its trailing comment says "Background color" although
    # the value is transparent.
    frame_css = """
    <style>
        .framed-markdown {
            border: 2px solid #a2a3a2;  /* Border color */
            padding: 10px;              /* Space inside the border */
            border-radius: 5px;         /* Rounded corners */
            background-color: transparent;;  /* Background color */
            margin: 10px 0;             /* Margin around the frame */
        }
    </style>
    """
    
    # Inject CSS into the Streamlit app
    # unsafe_allow_html is set so the <style> element is emitted as HTML.
    st.markdown(frame_css, unsafe_allow_html=True)

    def render_markdown_to_html(markdown_str):
        """Convert a Markdown string to HTML using ``markdown.markdown``."""
        html_fragment = markdown.markdown(markdown_str)
        return html_fragment
    
    # Render the markdown content within the framed box
    # The current slide's Markdown is converted to HTML and wrapped in the
    # .framed-markdown <div>.
    # NOTE(review): current_text is user-editable (via the text area) and is emitted
    # with unsafe_allow_html=True — confirm that passing it through as HTML is acceptable.
    st.markdown(
        f'<div class="framed-markdown">{render_markdown_to_html(st.session_state.current_text)}</div>', 
        unsafe_allow_html=True
    )
    
    def generate_pdf(html_string):
        """Render ``html_string`` to PDF with the slide stylesheet applied.

        The stylesheet sets a 1920px x 1080px page without margins, a dark
        background with light text, and 32pt text for ``.page`` elements.

        Returns:
            BytesIO: In-memory PDF, rewound to position 0 so it can be read directly.
        """
        slide_css = """
        @page {
            size: 1920px 1080px; /* Set page size to Full HD resolution */
            margin: 0; /* Remove all margins */
        }
        body {
            font-family: sans-serif;
            background-color: #45474B; /* Set background color to grey */
            margin: 0; /* Remove body margin */
            padding: 0; /* Remove body padding */
        }
        .content {
            background-color: #45474B; /* Ensure the background color spans the full page */
            color: #F5F7F8; /* Set font color to white */
            padding: 20mm; /* Set padding to create text margins */
            box-sizing: border-box; /* Include padding in the element's total width and height */
        }
        .page {
            font-size: 32pt; /* Adjust the font size as needed */
            margin: 0; /* Remove margin from page content */
            padding: 0; /* Remove padding from page content */
        }
        """
        buffer = BytesIO()
        html_doc = HTML(string=html_string)
        stylesheet = CSS(string=slide_css)
        html_doc.write_pdf(buffer, stylesheets=[stylesheet])
        # Rewind so the caller reads the PDF from its first byte.
        buffer.seek(0)
        return buffer
    
    def create_pdf_from_markdown_strings(markdown_strings):
        html_pages = [render_markdown_to_html(md) for md in markdown_strings]
    
        # Combine HTML content with page breaks and add a style section for font size, margins, background color, and font color
        combined_html = '''
        <html>
        <head>
            <style>
                .page { 
                    font-size: 32pt; /* Adjust the font size as needed */
                    margin: 0; /* Remove margin from page content */
                    padding: 0; /* Remove padding from page content */
                }
            </style>
        </head>
        <body>
        '''
        for i, page in enumerate(html_pages):
            combined_html += f'<div class="content"><div class="page">{page}</div></div>'
            if i < len(html_pages) - 1:  # Only add page break if it's not the last page
                combined_html += '<div style="page-break-after: always;"></div>'
        combined_html += '</body></html>'
    
        return combined_html
        
    # Build the HTML for all slides from the (possibly edited) session text and
    # render it to PDF. These statements are unconditional inside this branch, so
    # the PDF is rebuilt on every script run that reaches here, not only on download.
    html_content = create_pdf_from_markdown_strings(st.session_state.summ_text)
    pdf_file = generate_pdf(html_content)

    # Writes a string of newlines ahead of the download button.
    st.write("\n\n\n")
    
    # Provide download link
    st.download_button(
        label="Download PDF",
        data=pdf_file,
        file_name="slides.pdf",
        mime="application/pdf"
    )
    # Closing notes shown under the download button: link to the thesis, contact
    # line, BibTeX entry and a pointer to the Grobidmonkey parser.
    # NOTE(review): "[email protected]" looks like a redaction placeholder and not
    # a real address — confirm the intended contact email.
    # NOTE(review): the last sentence ("how was the grobid's output is parsed")
    # reads as ungrammatical — confirm the intended wording.
    st.markdown("""
-----------------------------------------
Great! Thank you for using this huggingface space.\n
If you want to know more about this application, you can take a look at the [paper](https://studenttheses.uu.nl/handle/20.500.12932/45939).\n
To contact the author you can send an email to [email protected];\n
To cite the paper you can use Bibtex\n
```
@mastersthesis{lu2024unsupervised,
  title={Unsupervised Paper2Slides Generation},
  author={Lu, Zehao},
  year={2024}
}\n
```\n
To see how was the grobid's output is parsed, check [Grobidmonkey](https://github.com/com3dian/Grobidmonkey).
    """)