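"""BERT Topic Article Organizer App.

A Gradio app that loads arXiv papers listed in an uploaded JSON file, clusters
them into topics with BERTopic, lets the user remove unwanted topics and
reprocess, and exports the grouped articles as a Markdown file.
"""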
import gradio as gr
import json
from langchain_community.document_loaders import ArxivLoader
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain_core.documents import Document
from typing import Iterator, List, Dict, Tuple
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from umap import UMAP
import numpy as np
import tempfile
from collections import defaultdict

# 1. Data Loading
class CustomArxivLoader(ArxivLoader):
    """ArxivLoader that tags each document with its arXiv id and PDF URL."""

    def lazy_load(self) -> Iterator[Document]:
        documents = super().lazy_load()
        for document in documents:
            yield Document(
                page_content=document.page_content,
                metadata={
                    **document.metadata,
                    "ArxivId": self.query,
                    "Source": f"https://arxiv.org/pdf/{self.query}.pdf"
                }
            )
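# Illustrative standalone usage (not executed here; the id below is an
# arbitrary example, and loading requires network access):
#   docs = CustomArxivLoader(query="2106.09685").load()
#   docs[0].metadata["Source"]  # -> "https://arxiv.org/pdf/2106.09685.pdf"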

def load_documents_from_file(file_path: str) -> List[Document]:
    with open(file_path, "r") as f:
        results = json.load(f)
    
    arxiv_urls = results["collected_urls"]["arxiv.org"]
    # str.strip(".pdf") strips any of the characters '.', 'p', 'd', 'f' from both
    # ends and can corrupt ids; removesuffix (Python 3.9+) drops only the extension.
    arxiv_ids = [url.split("/")[-1].removesuffix(".pdf") for url in arxiv_urls]
    
    loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]
    merged_loader = MergedDataLoader(loaders=loaders)
    
    return merged_loader.load()
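# Expected upload format (a sketch inferred from the keys read above; any other
# fields in the file are ignored):
# {
#   "collected_urls": {
#     "arxiv.org": ["https://arxiv.org/pdf/<id>.pdf", ...]
#   }
# }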

# 2. Topic Modeling
def create_topic_model(umap_params: Dict, bertopic_params: Dict) -> BERTopic:
    umap_model = UMAP(**umap_params)
    representation_model = KeyBERTInspired()
    
    return BERTopic(
        language="english",
        verbose=True,
        umap_model=umap_model,
        representation_model=representation_model,
        **bertopic_params
    )
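# Illustrative call mirroring the UI slider defaults further below (the values
# are the defaults, not tuned recommendations):
#   model = create_topic_model(
#       umap_params={"n_neighbors": 15, "n_components": 5, "min_dist": 0.1},
#       bertopic_params={"min_topic_size": 10, "nr_topics": 30, "top_n_words": 10,
#                        "n_gram_range": (1, 1), "calculate_probabilities": False},
#   )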

def process_documents(documents: List[Document], topic_model: BERTopic) -> Tuple[List[int], List[str]]:
    contents = [doc.page_content for doc in documents]
    topics, _ = topic_model.fit_transform(contents)
    topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=' ')
    # generate_topic_labels returns one label per topic, sorted by topic id (the
    # -1 outlier topic comes first when present). Expand that to one label per
    # document so every downstream zip over (documents, topics, labels) lines up.
    sorted_topic_ids = sorted(set(topics))
    id_to_label = {topic_id: topic_labels[i] for i, topic_id in enumerate(sorted_topic_ids)}
    labels = [id_to_label[topic] for topic in topics]

    return topics, labels

# 3. Data Manipulation
def create_docs_matrix(documents: List[Document], topics: List[int], labels: List[str]) -> List[List[str]]:
    # labels is per-document (see process_documents); topics is kept in the
    # signature for call-site stability but is not needed for the table.
    return [
        [str(i), label, doc.metadata['Title']]
        for i, (doc, label) in enumerate(zip(documents, labels))
    ]

def get_unique_topics(labels: List[str]) -> List[str]:
    return sorted(set(labels))

def remove_topics(state: Dict, topics_to_remove: List[str]) -> Dict:
    """Drop every document whose per-document topic label is in topics_to_remove."""
    documents, topics, labels = state['documents'], state['topics'], state['labels']
    filtered_data = [
        (doc, topic, label)
        for doc, topic, label in zip(documents, topics, labels)
        if label not in topics_to_remove
    ]
    new_documents, new_topics, new_labels = map(list, zip(*filtered_data)) if filtered_data else ([], [], [])
    return {**state, 'documents': new_documents, 'topics': new_topics, 'labels': new_labels}
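# Illustrative behavior on a hypothetical two-document state:
#   state = {"documents": [d1, d2], "topics": [0, 1],
#            "labels": ["llm agents", "protein folding"]}
#   remove_topics(state, ["protein folding"])["documents"]  # -> [d1]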

# 4. Output Generation
def create_markdown_content(state: Dict) -> str:
    documents, labels = state['documents'], state['labels']
    if not documents or not labels:
        return "No data available for download."

    # Group documents under their per-document topic label.
    topic_documents = defaultdict(list)
    for doc, label in zip(documents, labels):
        topic_documents[label].append(doc)

    content = ["# Arxiv Articles by Topic\n"]
    for topic, docs in topic_documents.items():
        content.append(f"## {topic}\n")
        for document in docs:
            content.append(f"### {document.metadata['Title']}")
            content.append(f"{document.metadata['Summary']}")  

    return "\n".join(content)

# 5. Gradio Interface
def create_gradio_interface():
    with gr.Blocks(theme="default") as demo:
        gr.Markdown("# BERT Topic Article Organizer App")
        gr.Markdown("Organizes arxiv articles in different topics and exports it in a zip file.")

        state = gr.State(value={})

        with gr.Row():
            file_uploader = gr.UploadButton("Click to upload", file_types=[".json"], file_count="single")
            reprocess_button = gr.Button("Reprocess Documents")
            download_button = gr.Button("Download Results")

        with gr.Row():
            with gr.Column():
                umap_n_neighbors = gr.Slider(minimum=2, maximum=100, value=15, step=1, label="UMAP n_neighbors")
                umap_n_components = gr.Slider(minimum=2, maximum=100, value=5, step=1, label="UMAP n_components")
                umap_min_dist = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="UMAP min_dist")
            with gr.Column():
                min_topic_size = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic min_topic_size")
                nr_topics = gr.Slider(minimum=1, maximum=100, value=30, step=1, label="BERTopic nr_topics")  # gr.Slider needs a numeric default; "auto" is not a valid slider value (30 is an arbitrary choice)
                top_n_words = gr.Slider(minimum=5, maximum=50, value=10, step=1, label="BERTopic top_n_words")
                n_gram_range = gr.Slider(minimum=1, maximum=3, value=1, step=1, label="BERTopic n_gram_range")
                calculate_probabilities = gr.Checkbox(label="Calculate Probabilities", value=False)

        output_matrix = gr.DataFrame(
            label="Processing Result",
            headers=["ID", "Topic", "Title"],
            col_count=(3, "fixed"),
            interactive=False
        )

        with gr.Row():
            topic_dropdown = gr.Dropdown(label="Select Topics to Remove", multiselect=True, interactive=True)
            remove_topics_button = gr.Button("Remove Selected Topics")

        markdown_output = gr.File(label="Download Markdown")

        def update_ui(state: Dict):
            matrix = create_docs_matrix(state['documents'], state['topics'], state['labels'])
            unique_topics = get_unique_topics(state['labels'])
            return matrix, gr.Dropdown(choices=unique_topics, value=[])

        def process_and_update(state: Dict, umap_n_neighbors: int, umap_n_components: int, umap_min_dist: float, 
                               min_topic_size: int, nr_topics: int, top_n_words: int, n_gram_range: int, 
                               calculate_probabilities: bool):
            documents = state.get('documents', [])
            if not documents:
                # Nothing to model yet (no upload, or every document was removed).
                return state, [], gr.Dropdown(choices=[], value=[])
            umap_params = {
                "n_neighbors": umap_n_neighbors, 
                "n_components": umap_n_components, 
                "min_dist": umap_min_dist
            }
            bertopic_params = {
                "min_topic_size": min_topic_size, 
                "nr_topics": nr_topics,
                "top_n_words": top_n_words,
                "n_gram_range": (1, n_gram_range),
                "calculate_probabilities": calculate_probabilities
            }
            
            topic_model = create_topic_model(umap_params, bertopic_params)
            topics, labels = process_documents(documents, topic_model)
            
            new_state = {**state, 'documents': documents, 'topics': topics, 'labels': labels}
            matrix, dropdown = update_ui(new_state)
            return new_state, matrix, dropdown

        def load_and_process(file, umap_n_neighbors, umap_n_components, umap_min_dist, 
                             min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities):
            # Depending on the Gradio version, UploadButton passes a file wrapper or a plain path.
            file_path = file.name if hasattr(file, "name") else file
            documents = load_documents_from_file(file_path)
            state = {'documents': documents}
            return process_and_update(state, umap_n_neighbors, umap_n_components, umap_min_dist, 
                                      min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities)

        file_uploader.upload(
            fn=load_and_process,
            inputs=[file_uploader, umap_n_neighbors, umap_n_components, umap_min_dist, 
                    min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
            outputs=[state, output_matrix, topic_dropdown]
        )

        reprocess_button.click(
            fn=process_and_update,
            inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist, 
                    min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
            outputs=[state, output_matrix, topic_dropdown]
        )

        def remove_and_update(state: Dict, topics_to_remove: List[str], umap_n_neighbors: int, umap_n_components: int, 
                              umap_min_dist: float, min_topic_size: int, nr_topics: int, top_n_words: int, 
                              n_gram_range: int, calculate_probabilities: bool):
            new_state = remove_topics(state, topics_to_remove)
            return process_and_update(new_state, umap_n_neighbors, umap_n_components, umap_min_dist, 
                                      min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities)

        remove_topics_button.click(
            fn=remove_and_update,
            inputs=[state, topic_dropdown, umap_n_neighbors, umap_n_components, umap_min_dist, 
                    min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
            outputs=[state, output_matrix, topic_dropdown]
        )

        def create_download_file(state: Dict):
            content = create_markdown_content(state)
            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as temp_file:
                temp_file.write(content)
            return temp_file.name

        download_button.click(
            fn=create_download_file,
            inputs=[state],
            outputs=[markdown_output]
        )

    return demo

if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(share=True, show_error=True, max_threads=10, debug=True)