File size: 7,190 Bytes
ea5c59c
 
 
8d4620d
6f670c5
7ee71e2
b0eedcf
35fd393
b0eedcf
2341ee3
9ec3b13
ea5c59c
0cbdeb5
 
 
 
 
 
 
 
8d4620d
28c51ee
 
8d4620d
 
 
ea5c59c
8d4620d
ea5c59c
1ed0b9b
54f71b8
39f102c
 
 
 
 
 
 
48102c5
 
 
 
 
 
 
0499963
c2c697a
0ed5911
f1dc184
39f102c
 
 
230d178
39f102c
 
fda22ce
39f102c
 
0cbdeb5
39f102c
e514b11
39f102c
 
 
 
 
 
 
 
e514b11
39f102c
 
23cb72b
0cbdeb5
39f102c
 
48102c5
39f102c
 
d459820
 
09f103b
c2c697a
c7f2062
 
48102c5
c7f2062
48102c5
 
c7f2062
 
 
 
48102c5
c7f2062
 
 
 
 
 
 
05ae92f
c7f2062
 
 
 
 
 
 
 
3d2f753
c7f2062
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff55c20
c7f2062
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b17cdba
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import streamlit as st
import pandas as pd
import numpy as np
import os
import pickle
import torch
import markdown
from weasyprint import HTML, CSS
import io
from io import BytesIO
from grobidmonkey import reader

from transformers import pipeline
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration

from document import Document
from BartSE import BARTAutoEncoder


def save_uploaded_file(uploaded_file, upload_dir="./uploads"):
    """Persist an uploaded file to disk and return the path it was written to.

    Args:
        uploaded_file: Object exposing ``name`` (the client-supplied file
            name) and ``getbuffer()`` (the raw bytes to write).
        upload_dir: Directory that receives the file; created when missing.

    Returns:
        The path of the saved file as a string.
    """
    # The name is supplied by the client, so keep only its final component;
    # otherwise a name such as "../../app.py" would be written outside
    # upload_dir. Backslashes are normalised so Windows-style paths are
    # reduced to their last component as well.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if not safe_name or safe_name in (".", ".."):
        safe_name = "uploaded_file"

    os.makedirs(upload_dir, exist_ok=True)  # Create the directory if it doesn't exist
    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path  # Return the file path as a string

# ---- Page header and input widgets (drawn on every script run) ----
st.title('Paper2Slides')

st.subheader('Upload paper in pdf format')

# Wide column for the uploader, narrow column for the parser choice.
col1, col2 = st.columns([3, 1])
with col1:
    # None until the user has picked a file.
    uploaded_file = st.file_uploader("Choose a file")
with col2:
    # The selected string is handed to reader.MonkeyReader further down.
    option = st.selectbox(
        'Select parsing method.',
        ('monkey', 'x2d', 'lxml'))

# The tuple default yields a (low, high) pair; both bounds are forwarded to
# document.merge below.
# NOTE(review): the label does not say what the range controls — confirm the
# meaning against Document.merge.
range_values = st.slider(
    'Select a range of values',
    min_value=0,
    max_value=100,
    value=(0, 25)
    )

# Stays None unless the generation branch below runs during this script run.
summ_text = None

# Parse the paper and summarise it. The branch is skipped once
# 'generation_done' has been stored in the session state, so the models are
# not reloaded on later reruns of the script within the same session.
if (uploaded_file is not None) and (not 'generation_done' in st.session_state):
    st.write(uploaded_file.name)
    bytes_data = uploaded_file.getvalue()
    st.write(len(bytes_data), "bytes")
    
    # The reader is given a path on disk, so the upload is written out first.
    saved_file_path = save_uploaded_file(uploaded_file)
    monkeyReader = reader.MonkeyReader(option)
    
    # read paper content
    essay = monkeyReader.readEssay(saved_file_path)
        
    with st.status("Understanding paper..."):
        
        Barttokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        summ_model_path = 'com3dian/Bart-large-paper2slides-summarizer'
        summarizor = BartForConditionalGeneration.from_pretrained(summ_model_path)
        exp_model_path = 'com3dian/Bart-large-paper2slides-expander'
        expandor = BartForConditionalGeneration.from_pretrained(exp_model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): `summarizor` is passed for both model arguments and
        # `expandor` is deleted below without ever being used. Confirm against
        # BARTAutoEncoder's signature whether the second argument was meant to
        # be `expandor`; if not, loading the expander model is wasted work.
        BartSE = BARTAutoEncoder(summarizor, summarizor, device)
        # Only the local names are removed here; objects still referenced
        # elsewhere (e.g. by BartSE) remain alive.
        del summarizor, expandor
            
        document = Document(essay, Barttokenizer)
        del Barttokenizer
        # Slider bounds, the auto-encoder and the device are handed to merge;
        # presumably this fills document.segmentation, which is read below —
        # TODO confirm in document.py. `length` is not read again in the
        # visible code.
        length = document.merge(range_values[0],range_values[1], BartSE, device)

    with st.status("Generating slides..."):
        # A summarization pipeline is built from the same checkpoint path as
        # the model loaded above.
        summarizor = pipeline("summarization", model=summ_model_path, device = device)
        title_list = document.segmentation['key']
        summ_text = summarizor(document.segmentation['text'], max_length=100, min_length=10, do_sample=False)
        # Keep only the generated text of each pipeline result.
        summ_text = [text['summary_text'] for text in summ_text]
    
    # Mark the work as finished so this branch is not entered again.
    st.session_state.generation_done = True

if (summ_text is not None) or ('summ_text' in st.session_state):
    
    # Build the Markdown source of every slide from its title and summary.
    def format(title_list, text_list):
        """Turn section titles and summaries into Markdown slide strings.

        Each slide is a level-2 heading followed by one bullet per sentence.
        Sentences are obtained by a plain split on ``.``, so decimals and
        abbreviations are split as well.

        Args:
            title_list: Slide titles, indexed in parallel with ``text_list``.
            text_list: Summary text of each slide.

        Returns:
            A list with one Markdown string per entry of ``text_list``.

        Raises:
            IndexError: If ``title_list`` is shorter than ``text_list``.
        """
        format_list = []
        for index, text in enumerate(text_list):
            # GitHub-flavored Markdown (what st.markdown renders) only treats
            # "##" as a heading when it is followed by a space; "##Title"
            # would be displayed as literal text.
            title = f"## {title_list[index].strip()}\n"
            # Split text by periods and drop empty fragments
            sentences = [fragment.strip() for fragment in text.split('.')]
            # Create Markdown list items, restoring the period removed by split
            list_items = "".join(f"- {sentence}.\n" for sentence in sentences if sentence)
            format_list.append(title + list_items)
        return format_list
    
    # Initialize session state for page index and text.
    # Each value is created only when missing, so edits and the current page
    # are kept when the script runs again.
    if 'page_index' not in st.session_state:
        st.session_state.page_index = 0
    
    # title_list and summ_text hold fresh data only on the run that performed
    # the generation; the guard keeps this line from executing on later runs.
    if 'summ_text' not in st.session_state:
        st.session_state.summ_text = format(title_list, summ_text)
    
    # NOTE(review): indexing raises IndexError when summ_text is an empty
    # list — confirm the summarizer always yields at least one slide.
    if 'current_text' not in st.session_state:
        st.session_state.current_text = st.session_state.summ_text[st.session_state.page_index]
    
    
    
    # Button callback: step to the neighbouring slide and load its text.
    def turn_page(direction):
        """Move one slide forward ("next") or backward ("prev").

        The index is left unchanged at either end of the deck or for any
        other ``direction`` value; ``current_text`` is always re-read from
        the slide the index points at afterwards.
        """
        state = st.session_state
        last_index = len(state.summ_text) - 1
        if direction == "next":
            if state.page_index < last_index:
                state.page_index += 1
        elif direction == "prev":
            if state.page_index > 0:
                state.page_index -= 1
        state.current_text = state.summ_text[state.page_index]
    
    # Text-area callback: store the edited text for the slide being shown.
    def update_text():
        """Copy the widget value into the slide list and into ``current_text``."""
        edited = st.session_state.text_area_value
        st.session_state.summ_text[st.session_state.page_index] = edited
        st.session_state.current_text = edited
    
    
    # Display editable text box pre-filled with the current slide; update_text
    # runs when the content changes and reads the value through the widget key.
    # The returned `text` is not used in the visible code.
    text = st.text_area("Edit Text", st.session_state.current_text, height=200, key="text_area_value", on_change=update_text)
    
    # Display page turner controls: narrow outer columns for the buttons,
    # a wide middle column for the page counter.
    col1, col2, col3 = st.columns([2.25, 12, 1.7])
    
    # Previous button in col1
    with col1:
        st.button("Previous", on_click=turn_page, args=("prev",))
    
    # Center aligned "Page X of Y" text in col2 (page_index is zero-based,
    # hence the + 1); raw HTML is used for the centering.
    with col2:
        st.markdown(
            f'<div style="display: flex; justify-content: center; align-items: center; height: 100%;">'
            f'Page {st.session_state.page_index + 1} of {len(st.session_state.summ_text)}'
            f'</div>',
            unsafe_allow_html=True
        )
    
    # Next button in col3, right aligned
    with col3:
        st.button("Next", on_click=turn_page, args=("next",))
    
    # Render the current slide's Markdown as a preview below the controls
    st.markdown(st.session_state.current_text)
    
    def render_markdown_to_html(markdown_str):
        """Convert one Markdown string to HTML with the ``markdown`` package."""
        html_fragment = markdown.markdown(markdown_str)
        return html_fragment
    
    def create_pdf_from_markdown_strings(markdown_strings):
        """Combine Markdown slides into one HTML document, one slide per page.

        Despite its name, this function does not produce a PDF: it returns
        the HTML string that is later handed to the PDF renderer. Page size
        and orientation are not set here.

        Args:
            markdown_strings: Markdown source of each slide, in page order.

        Returns:
            An HTML document string in which every slide is wrapped in a
            ``<div class="page">`` and consecutive slides are separated by a
            page-break ``<div>``.
        """
        # Document head with a style section for the slide font size
        html_head = '''
        <html>
        <head>
            <style>
                .page { 
                    font-size: 16pt; /* Adjust the font size as needed */
                }
            </style>
        </head>
        <body>
        '''
        # The break is placed between slides only, so no page break follows
        # the last slide.
        page_break = '<div style="page-break-after: always;"></div>'
        pages = (
            f'<div class="page">{render_markdown_to_html(md)}</div>'
            for md in markdown_strings
        )
        return html_head + page_break.join(pages) + '</body></html>'
    
    def generate_pdf(html_string):
        """Render an HTML string to PDF and return it as an in-memory buffer.

        The page stylesheet is defined here: A4 landscape pages with 20mm
        margins and a sans-serif body font.

        Args:
            html_string: Complete HTML document to render.

        Returns:
            A ``BytesIO`` holding the PDF, positioned at the start so it can
            be read immediately.
        """
        css = """
        @page {
            size: A4 landscape;
            margin: 20mm;
        }
        body {
            font-family: sans-serif;
        }
        """
        stylesheet = CSS(string=css)
        source_document = HTML(string=html_string)

        buffer = BytesIO()
        source_document.write_pdf(buffer, stylesheets=[stylesheet])
        # Rewind: the write leaves the position at the end of the data.
        buffer.seek(0)
        return buffer
    
    # Build the PDF from the slide texts stored in the session state (which
    # include edits saved by update_text). This executes each time the script
    # reaches this branch, not only when the button is pressed.
    html_content = create_pdf_from_markdown_strings(st.session_state.summ_text)
    pdf_file = generate_pdf(html_content)
        
    # Provide download button serving the in-memory PDF as "slides.pdf"
    st.download_button(
        label="Download PDF",
        data=pdf_file,
        file_name="slides.pdf",
        mime="application/pdf"
    )