Spaces:

ml6team
/

doc-to-slides

Sleeping

File size: 6,345 Bytes

ea5c59c
 
 
8d4620d
6f670c5
7ee71e2
b0eedcf
35fd393
b0eedcf
2341ee3
9ec3b13
ea5c59c
0cbdeb5
 
 
 
 
 
 
 
8d4620d
28c51ee
 
8d4620d
 
 
ea5c59c
8d4620d
ea5c59c
1ed0b9b
54f71b8
39f102c
 
 
 
 
 
 
0ed5911
39f102c
230d178
39f102c
 
 
230d178
39f102c
 
fda22ce
39f102c
 
0cbdeb5
39f102c
e514b11
39f102c
 
 
 
 
 
 
 
e514b11
39f102c
 
 
0cbdeb5
39f102c
 
 
 
d11bba3
39f102c
 
7ba9533
09f103b
 
 
c464fcc
77bbf66
6b99783
 
 
 
 
 
44e5e0c
6b99783
 
77bbf66
7af5631
77bbf66
 
7af5631
09f103b
6b99783
09f103b
659f042
 
 
 
 
77bbf66
 
2e05f78
77bbf66
 
 
a23cae0
77bbf66
354eef6
 
a23cae0
354eef6
7af5631
e7d6973
 
 
 
77bbf66
0265ddb
1da1b0f
b17cdba
77bbf66
 
b17cdba
 
77bbf66
b17cdba
 
 
 
 
 
 
 
 
034b0a5
77bbf66
 
6b99783
7af5631
b0eedcf
 
 
45e4e41
b0eedcf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16c4994
45e4e41
c06e0bf
5cf1f02
 
 
b2e85c2
5cf1f02
 
 
 
 
dbab208
5cf1f02
dbab208
 
b0eedcf
45e4e41
2341ee3
461e9ba
2341ee3
 
 
 
181cf6f
2341ee3
 
354eef6

import streamlit as st
import pandas as pd
import numpy as np
import os
import pickle
import torch
import markdown
from weasyprint import HTML, CSS
import io
from io import BytesIO
from grobidmonkey import reader

from transformers import pipeline
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration

from document import Document
from BartSE import BARTAutoEncoder


def save_uploaded_file(uploaded_file):
    file_path = os.path.join("./uploads", uploaded_file.name)
    os.makedirs("./uploads", exist_ok=True)  # Create 'uploads' directory if it doesn't exist
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path  # Return the file path as a string

st.title('Paper2Slides')

st.subheader('Upload paper in pdf format')

col1, col2 = st.columns([3, 1])
with col1:
    uploaded_file = st.file_uploader("Choose a file")
with col2:
    option = st.selectbox(
        'Select parsing method.',
        ('monkey', 'x2d', 'lxml'))

if uploaded_file is not None:
    
    st.write(uploaded_file.name)
    bytes_data = uploaded_file.getvalue()
    st.write(len(bytes_data), "bytes")
    
    saved_file_path = save_uploaded_file(uploaded_file)
    monkeyReader = reader.MonkeyReader(option)
    
    # read paper content
    essay = monkeyReader.readEssay(saved_file_path)
        
    with st.status("Understanding paper..."):
        
        Barttokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        summ_model_path = 'com3dian/Bart-large-paper2slides-summarizer'
        summarizor = BartForConditionalGeneration.from_pretrained(summ_model_path)
        exp_model_path = 'com3dian/Bart-large-paper2slides-expander'
        expandor = BartForConditionalGeneration.from_pretrained(exp_model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        BartSE = BARTAutoEncoder(summarizor, summarizor, device)
        del summarizor, expandor
            
        document = Document(essay, Barttokenizer)
        del Barttokenizer
        length = document.merge(25, 30, BartSE, device)

    with st.status("Generating slides..."):
        summarizor = pipeline("summarization", model=summ_model_path, device = device)
        summ_text = summarizor(document.segmentation['text'], max_length=100, min_length=10, do_sample=False)
        summ_text = [text['summary_text'] for text in summ_text]
        
        for summ in summ_text:
            st.write(summ)


with open('slides_text.pkl', 'rb') as file:
    summ_text = pickle.load(file)

# Function to render HTML content
def format(text_list):
    format_list = []
    for text in text_list:
        # Split text by periods
        sentences = text.split('.')
        # Create HTML list items
        list_items = "".join([f"- {sentence.strip()}.\n" for sentence in sentences if sentence.strip()])
        format_list.append(list_items)
    return format_list

# Initialize session state for page index and text
if 'page_index' not in st.session_state:
    st.session_state.page_index = 0

if 'summ_text' not in st.session_state:
    st.session_state.summ_text = format(summ_text)

if 'current_text' not in st.session_state:
    st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]



# Function to handle page turn
def turn_page(direction):
    if direction == "next" and st.session_state.page_index < len(summ_text) - 1:
        st.session_state.page_index += 1
    elif direction == "prev" and st.session_state.page_index > 0:
        st.session_state.page_index -= 1
    st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]

# Function to update the current text based on text_area changes
def update_text():
    st.session_state.summ_text[st.session_state.page_index] = st.session_state.text_area_value
    st.session_state.current_text = st.session_state.text_area_value


# Display editable text box
text = st.text_area("Edit Text", st.session_state.current_text, height=200, key="text_area_value", on_change=update_text)

# Display page turner controls
col1, col2, col3 = st.columns([2.25, 12, 1.7])

# Previous button in col1
with col1:
    st.button("Previous", on_click=turn_page, args=("prev",))

# Center aligned text in col2
with col2:
    st.markdown(
        f'<div style="display: flex; justify-content: center; align-items: center; height: 100%;">'
        f'Page {st.session_state.page_index + 1} of {len(summ_text)}'
        f'</div>',
        unsafe_allow_html=True
    )

# Next button in col3, right aligned
with col3:
    st.button("Next", on_click=turn_page, args=("next",))

# Display HTML box
st.markdown(st.session_state.current_text)

def render_markdown_to_html(markdown_str):
    return markdown.markdown(markdown_str)

def create_pdf_from_markdown_strings(markdown_strings):
    html_pages = [render_markdown_to_html(md) for md in markdown_strings]

    # Combine HTML content with page breaks and add a style section for font size and margins
    combined_html = '''
    <html>
    <head>
        <style>
            .page { 
                font-size: 16pt; /* Adjust the font size as needed */
            }
        </style>
    </head>
    <body>
    '''
    for i, page in enumerate(html_pages):
        combined_html += f'<div class="page">{page}</div>'
        if i < len(html_pages) - 1:  # Only add page break after if it's not the last page
            combined_html += '<div style="page-break-after: always;"></div>'
    combined_html += '</body></html>'

    # PDF options: landscape orientation and page size
    options = {
        'page-size': 'A4',
        'orientation': 'Landscape'
    }

    return combined_html

def generate_pdf(html_string):
    css = """
    @page {
        size: A4 landscape;
        margin: 20mm;
    }
    body {
        font-family: sans-serif;
    }
    """
    pdf = BytesIO()
    HTML(string=html_string).write_pdf(pdf, stylesheets=[CSS(string=css)])
    pdf.seek(0)
    return pdf

html_content = create_pdf_from_markdown_strings(st.session_state.summ_text)
pdf_file = generate_pdf(html_content)
    
# Provide download link
st.download_button(
    label="Download PDF",
    data=pdf_file,
    file_name="slides.pdf",
    mime="application/pdf"
)