Pclanglais committed on
Commit
a76021e
·
verified ·
1 Parent(s): 4c8a985

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -143
app.py CHANGED
@@ -1,85 +1,40 @@
1
  import transformers
2
  import re
3
- from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
4
  import torch
5
  import gradio as gr
6
- import json
7
- import os
8
- import shutil
9
- import requests
10
  import pandas as pd
11
 
12
  # Define the device
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
 
15
- editorial_model = "PleIAs/Bibliography-Formatter"
16
- token_classifier = pipeline(
 
 
 
17
  "token-classification", model=editorial_model, aggregation_strategy="simple", device=device
18
  )
 
 
 
19
 
20
  tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
21
 
22
- css = """
23
- <style>
24
- .manuscript {
25
- display: flex;
26
- margin-bottom: 10px;
27
- align-items: baseline;
28
- }
29
- .annotation {
30
- width: 15%;
31
- padding-right: 20px;
32
- color: grey !important;
33
- font-style: italic;
34
- text-align: right;
35
- }
36
- .content {
37
- width: 80%;
38
- }
39
- h2 {
40
- margin: 0;
41
- font-size: 1.5em;
42
- }
43
- .title-content h2 {
44
- font-weight: bold;
45
- }
46
- .bibliography-content {
47
- color:darkgreen !important;
48
- margin-top: -5px; /* Adjust if needed to align with annotation */
49
- }
50
-
51
- .paratext-content {
52
- color:#a4a4a4 !important;
53
- margin-top: -5px; /* Adjust if needed to align with annotation */
54
- }
55
- </style>
56
- """
57
-
58
- # Preprocess the 'word' column
59
  def preprocess_text(text):
60
- # Remove HTML tags
61
  text = re.sub(r'<[^>]+>', '', text)
62
- # Replace newlines with spaces
63
  text = re.sub(r'\n', ' ', text)
64
- # Replace multiple spaces with a single space
65
  text = re.sub(r'\s+', ' ', text)
66
- # Strip leading and trailing whitespace
67
  return text.strip()
68
-
69
  def split_text(text, max_tokens=500):
70
- # Split the text by newline characters
71
  parts = text.split("\n")
72
  chunks = []
73
  current_chunk = ""
74
 
75
  for part in parts:
76
- # Add part to current chunk
77
- if current_chunk:
78
- temp_chunk = current_chunk + "\n" + part
79
- else:
80
- temp_chunk = part
81
-
82
- # Tokenize the temporary chunk
83
  num_tokens = len(tokenizer.tokenize(temp_chunk))
84
 
85
  if num_tokens <= max_tokens:
@@ -92,7 +47,6 @@ def split_text(text, max_tokens=500):
92
  if current_chunk:
93
  chunks.append(current_chunk)
94
 
95
- # If no newlines were found and still exceeding max_tokens, split further
96
  if len(chunks) == 1 and len(tokenizer.tokenize(chunks[0])) > max_tokens:
97
  long_text = chunks[0]
98
  chunks = []
@@ -100,7 +54,6 @@ def split_text(text, max_tokens=500):
100
  split_point = len(long_text) // 2
101
  while split_point < len(long_text) and not re.match(r'\s', long_text[split_point]):
102
  split_point += 1
103
- # Ensure split_point does not go out of range
104
  if split_point >= len(long_text):
105
  split_point = len(long_text) - 1
106
  chunks.append(long_text[:split_point].strip())
@@ -118,7 +71,6 @@ def extract_year(text):
118
  return year_match.group(1) if year_match else None
119
 
120
  def create_bibtex_entry(data):
121
- # Determine the entry type
122
  if 'journal' in data:
123
  entry_type = 'article'
124
  elif 'booktitle' in data:
@@ -126,13 +78,11 @@ def create_bibtex_entry(data):
126
  else:
127
  entry_type = 'book'
128
 
129
- # Extract year from 'None' if it exists
130
  none_content = data.pop('none', '')
131
  year = extract_year(none_content)
132
  if year and 'year' not in data:
133
  data['year'] = year
134
 
135
- # Create BibTeX ID
136
  author_words = data.get('author', '').split()
137
  first_author = author_words[0] if author_words else 'Unknown'
138
  bibtex_id = f"{first_author}{year}" if year else first_author
@@ -149,98 +99,57 @@ def create_bibtex_entry(data):
149
  bibtex = bibtex.rstrip(',\n') + "\n}"
150
  return bibtex
151
 
152
- def transform_chunks(marianne_segmentation):
153
- marianne_segmentation = pd.DataFrame(marianne_segmentation)
154
- marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
155
- marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)
156
- marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
157
- marianne_segmentation = marianne_segmentation[marianne_segmentation['word'].notna() & (marianne_segmentation['word'] != '') & (marianne_segmentation['word'] != ' ')]
158
-
159
- html_output = []
160
- bibtex_data = {}
161
- current_entity = None
162
-
163
- for _, row in marianne_segmentation.iterrows():
164
- entity_group = row['entity_group']
165
- result_entity = "[" + entity_group.capitalize() + "]"
166
- word = row['word']
167
-
168
- if entity_group != 'None':
169
- if entity_group in bibtex_data:
170
- bibtex_data[entity_group] += ' ' + word
171
- else:
172
- bibtex_data[entity_group] = word
173
- current_entity = entity_group
174
- else:
175
- if current_entity:
176
- bibtex_data[current_entity] += ' ' + word
177
- else:
178
- bibtex_data['None'] = bibtex_data.get('None', '') + ' ' + word
179
-
180
- html_output.append(f'<div class="manuscript"><div class="annotation">{result_entity}</div><div class="content">{word}</div></div>')
181
-
182
- bibtex_entry = create_bibtex_entry(bibtex_data)
183
-
184
- final_html = '\n'.join(html_output)
185
- return final_html, bibtex_entry
186
-
187
- # Class to encapsulate the Falcon chatbot
188
- class MistralChatBot:
189
- def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
190
- self.system_prompt = system_prompt
191
-
192
- def predict(self, user_message):
193
  editorial_text = re.sub("\n", " ¶ ", user_message)
194
  num_tokens = len(tokenizer.tokenize(editorial_text))
195
 
196
- if num_tokens > 500:
197
- batch_prompts = split_text(editorial_text, max_tokens=500)
198
- else:
199
- batch_prompts = [editorial_text]
200
 
201
- out = token_classifier(batch_prompts)
202
- classified_list = []
203
- for classification in out:
204
- df = pd.DataFrame(classification)
205
- classified_list.append(df)
206
-
207
- classified_list = pd.concat(classified_list)
208
-
209
- # Debugging: Print the classified list
210
- print("Classified List:")
211
- print(classified_list)
212
 
213
- html_output, bibtex_entry = transform_chunks(classified_list)
 
214
 
215
- # Debugging: Print the outputs
216
- print("HTML Output:")
217
- print(html_output)
218
- print("BibTeX Entry:")
219
- print(bibtex_entry)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
- return bibtex_entry
222
 
223
- # Create the Falcon chatbot instance
224
- mistral_bot = MistralChatBot()
225
 
226
  # Define the Gradio interface
227
- title = "Éditorialisation"
228
- description = "Un outil expérimental d'identification de la structure du texte à partir d'un encoder (Deberta)"
229
- examples = [
230
- [
231
- "Qui peut bénéficier de l'AIP?", # user_message
232
- 0.7 # temperature
233
- ]
234
- ]
235
-
236
- demo = gr.Blocks()
237
-
238
  with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
239
- gr.HTML("""<h1 style="text-align:center">Reversed Zotero</h1>""")
240
- text_input = gr.Textbox(label="Your text", type="text", lines=5)
241
- text_button = gr.Button("Extract a structured bibtex")
242
- bibtex_output = gr.Textbox(label="BibTeX Entry", lines=10)
243
- text_button.click(mistral_bot.predict, inputs=text_input, outputs=[bibtex_output])
244
 
245
  if __name__ == "__main__":
246
  demo.queue().launch()
 
1
  import transformers
2
  import re
3
+ from transformers import AutoTokenizer, pipeline
4
  import torch
5
  import gradio as gr
 
 
 
 
6
  import pandas as pd
7
 
8
  # Define the device
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
10
 
11
+ # Load models
12
+ editorial_model = "PleIAs/Estienne"
13
+ bibliography_model = "PleIAs/Bibliography-Formatter"
14
+
15
+ editorial_classifier = pipeline(
16
  "token-classification", model=editorial_model, aggregation_strategy="simple", device=device
17
  )
18
+ bibliography_classifier = pipeline(
19
+ "token-classification", model=bibliography_model, aggregation_strategy="simple", device=device
20
+ )
21
 
22
  tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
23
 
24
+ # Helper functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def preprocess_text(text):
 
26
  text = re.sub(r'<[^>]+>', '', text)
 
27
  text = re.sub(r'\n', ' ', text)
 
28
  text = re.sub(r'\s+', ' ', text)
 
29
  return text.strip()
30
+
31
  def split_text(text, max_tokens=500):
 
32
  parts = text.split("\n")
33
  chunks = []
34
  current_chunk = ""
35
 
36
  for part in parts:
37
+ temp_chunk = current_chunk + "\n" + part if current_chunk else part
 
 
 
 
 
 
38
  num_tokens = len(tokenizer.tokenize(temp_chunk))
39
 
40
  if num_tokens <= max_tokens:
 
47
  if current_chunk:
48
  chunks.append(current_chunk)
49
 
 
50
  if len(chunks) == 1 and len(tokenizer.tokenize(chunks[0])) > max_tokens:
51
  long_text = chunks[0]
52
  chunks = []
 
54
  split_point = len(long_text) // 2
55
  while split_point < len(long_text) and not re.match(r'\s', long_text[split_point]):
56
  split_point += 1
 
57
  if split_point >= len(long_text):
58
  split_point = len(long_text) - 1
59
  chunks.append(long_text[:split_point].strip())
 
71
  return year_match.group(1) if year_match else None
72
 
73
  def create_bibtex_entry(data):
 
74
  if 'journal' in data:
75
  entry_type = 'article'
76
  elif 'booktitle' in data:
 
78
  else:
79
  entry_type = 'book'
80
 
 
81
  none_content = data.pop('none', '')
82
  year = extract_year(none_content)
83
  if year and 'year' not in data:
84
  data['year'] = year
85
 
 
86
  author_words = data.get('author', '').split()
87
  first_author = author_words[0] if author_words else 'Unknown'
88
  bibtex_id = f"{first_author}{year}" if year else first_author
 
99
  bibtex = bibtex.rstrip(',\n') + "\n}"
100
  return bibtex
101
 
102
+ class CombinedProcessor:
103
+ def process(self, user_message):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  editorial_text = re.sub("\n", " ¶ ", user_message)
105
  num_tokens = len(tokenizer.tokenize(editorial_text))
106
 
107
+ batch_prompts = split_text(editorial_text, max_tokens=500) if num_tokens > 500 else [editorial_text]
 
 
 
108
 
109
+ editorial_out = editorial_classifier(batch_prompts)
110
+ editorial_df = pd.concat([pd.DataFrame(classification) for classification in editorial_out])
 
 
 
 
 
 
 
 
 
111
 
112
+ # Filter out only bibliography entries
113
+ bibliography_entries = editorial_df[editorial_df['entity_group'] == 'bibliography']['word'].tolist()
114
 
115
+ bibtex_entries = []
116
+ for entry in bibliography_entries:
117
+ bib_out = bibliography_classifier(entry)
118
+ bib_df = pd.DataFrame(bib_out)
119
+
120
+ bibtex_data = {}
121
+ current_entity = None
122
+ for _, row in bib_df.iterrows():
123
+ entity_group = row['entity_group']
124
+ word = row['word']
125
+
126
+ if entity_group != 'None':
127
+ if entity_group in bibtex_data:
128
+ bibtex_data[entity_group] += ' ' + word
129
+ else:
130
+ bibtex_data[entity_group] = word
131
+ current_entity = entity_group
132
+ else:
133
+ if current_entity:
134
+ bibtex_data[current_entity] += ' ' + word
135
+ else:
136
+ bibtex_data['None'] = bibtex_data.get('None', '') + ' ' + word
137
+
138
+ bibtex_entry = create_bibtex_entry(bibtex_data)
139
+ bibtex_entries.append(bibtex_entry)
140
 
141
+ return bibtex_entries
142
 
143
+ # Create the processor instance
144
+ processor = CombinedProcessor()
145
 
146
  # Define the Gradio interface
 
 
 
 
 
 
 
 
 
 
 
147
  with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
148
+ gr.HTML("""<h1 style="text-align:center">Combined Editorial and Bibliography Processor</h1>""")
149
+ text_input = gr.Textbox(label="Your text", type="text", lines=10)
150
+ text_button = gr.Button("Process Text")
151
+ bibtex_output = gr.Textbox(label="BibTeX Entries", lines=15)
152
+ text_button.click(processor.process, inputs=text_input, outputs=[bibtex_output])
153
 
154
  if __name__ == "__main__":
155
  demo.queue().launch()