import re

import gradio as gr
import pandas as pd
import torch
from transformers import AutoTokenizer, pipeline
# Define the device
device = "cuda" if torch.cuda.is_available() else "cpu"

editorial_model = "PleIAs/Bibliography-Formatter"
token_classifier = pipeline(
    "token-classification", model=editorial_model, aggregation_strategy="simple", device=device
)
tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
css = """
<style>
.manuscript {
    display: flex;
    margin-bottom: 10px;
    align-items: baseline;
}
.annotation {
    width: 15%;
    padding-right: 20px;
    color: grey !important;
    font-style: italic;
    text-align: right;
}
.content {
    width: 80%;
}
h2 {
    margin: 0;
    font-size: 1.5em;
}
.title-content h2 {
    font-weight: bold;
}
.bibliography-content {
    color: darkgreen !important;
    margin-top: -5px; /* Adjust if needed to align with annotation */
}
.paratext-content {
    color: #a4a4a4 !important;
    margin-top: -5px; /* Adjust if needed to align with annotation */
}
</style>
"""
# Preprocess the 'word' column
def preprocess_text(text):
    """Strip HTML tags and collapse all whitespace to single spaces."""
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Replace newlines with spaces
    text = re.sub(r'\n', ' ', text)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing whitespace
    return text.strip()
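
# A quick illustration of the intended behaviour (the input string is made up):
#   preprocess_text("<b>Nature</b>\n  Vol. 12")  ->  "Nature Vol. 12"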
def split_text(text, max_tokens=500):
    """Split text into chunks of at most max_tokens tokens, preferring newline boundaries."""
    # Split the text by newline characters
    parts = text.split("\n")
    chunks = []
    current_chunk = ""
    for part in parts:
        # Tentatively append the part to the current chunk
        if current_chunk:
            temp_chunk = current_chunk + "\n" + part
        else:
            temp_chunk = part
        # Keep the part only if the token budget allows it
        num_tokens = len(tokenizer.tokenize(temp_chunk))
        if num_tokens <= max_tokens:
            current_chunk = temp_chunk
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = part
    if current_chunk:
        chunks.append(current_chunk)
    # If no newlines were found and the text still exceeds max_tokens, split at whitespace
    if len(chunks) == 1 and len(tokenizer.tokenize(chunks[0])) > max_tokens:
        long_text = chunks[0]
        chunks = []
        while len(tokenizer.tokenize(long_text)) > max_tokens:
            # Start at the midpoint and advance to the next whitespace character
            split_point = len(long_text) // 2
            while split_point < len(long_text) and not long_text[split_point].isspace():
                split_point += 1
            # Ensure split_point does not go out of range
            if split_point >= len(long_text):
                split_point = len(long_text) - 1
            chunks.append(long_text[:split_point].strip())
            long_text = long_text[split_point:].strip()
        if long_text:
            chunks.append(long_text)
    return chunks
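
# Usage sketch (`reference_text` is an illustrative variable, not defined above):
#   chunks = split_text(reference_text, max_tokens=500)
# The goal is to keep each chunk within the classifier's 512-token window while
# preferring to split on newlines rather than mid-reference.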
def create_bibtex_entry(data):
    """Assemble a BibTeX @article entry from the extracted fields, skipping empty ones."""
    author = data.get('Author', '').strip()
    title = data.get('Title', '').strip()
    journal = data.get('Journal', '').strip()
    year = data.get('Year', '').strip()
    volume = data.get('Volume', '').strip()
    pages = data.get('Pages', '').strip()
    doi = data.get('Doi', '').strip()
    # Remove "doi: " prefix if present
    doi = doi.replace('doi: ', '')

    bibtex = "@article{idnothing,\n"
    if author: bibtex += f" author = {{{author}}},\n"
    if title: bibtex += f" title = {{{title}}},\n"
    if journal: bibtex += f" journal = {{{journal}}},\n"
    if year: bibtex += f" year = {{{year}}},\n"
    if volume: bibtex += f" volume = {{{volume}}},\n"
    if pages: bibtex += f" pages = {{{pages}}},\n"
    if doi: bibtex += f" doi = {{{doi}}},\n"
    bibtex += "}"
    return bibtex
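
# For example, a dict such as {'Author': 'Smith, J.', 'Title': 'On citations', 'Year': '2020'}
# (illustrative values) would yield:
#   @article{idnothing,
#    author = {Smith, J.},
#    title = {On citations},
#    year = {2020},
#   }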
def transform_chunks(marianne_segmentation):
    """Turn token-classification output into annotated HTML plus a BibTeX entry."""
    marianne_segmentation = pd.DataFrame(marianne_segmentation)
    marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
    # Restore the newlines that were encoded as pilcrows, then normalise each word
    marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)
    marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
    marianne_segmentation = marianne_segmentation[
        marianne_segmentation['word'].notna()
        & (marianne_segmentation['word'] != '')
        & (marianne_segmentation['word'] != ' ')
    ]

    html_output = []
    bibtex_data = {}
    current_entity = None
    for _, row in marianne_segmentation.iterrows():
        entity_group = row['entity_group']
        result_entity = "[" + entity_group.capitalize() + "]"
        word = row['word']
        if entity_group in ['Author', 'Title', 'Journal', 'Pages', 'Doi']:
            # Concatenate fragments that belong to the same field
            if entity_group in bibtex_data:
                bibtex_data[entity_group] += ' ' + word
            else:
                bibtex_data[entity_group] = word
            current_entity = entity_group
        elif entity_group == 'None':
            # Unlabelled text extends the current field, or is pooled for later parsing
            if current_entity:
                bibtex_data[current_entity] += ' ' + word
            else:
                bibtex_data['None'] = bibtex_data.get('None', '') + ' ' + word
        html_output.append(f'<div class="manuscript"><div class="annotation">{result_entity}</div><div class="content">{word}</div></div>')

    # Extract the year from the 'None' field if present, e.g. "(2020)"
    none_content = bibtex_data.get('None', '')
    year_match = re.search(r'\((\d{4})\)', none_content)
    if year_match:
        bibtex_data['Year'] = year_match.group(1)
    # Extract the volume from the 'None' field if present, e.g. ", 12,"
    volume_match = re.search(r',\s*(\d+),', none_content)
    if volume_match:
        bibtex_data['Volume'] = volume_match.group(1)

    bibtex_entry = create_bibtex_entry(bibtex_data)
    final_html = '\n'.join(html_output)
    return final_html, bibtex_entry
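
# The expected input is the aggregated output of the token-classification pipeline,
# i.e. a list of dicts with 'entity_group', 'word', 'score', 'start' and 'end' keys
# (or a DataFrame built from them); the values below are illustrative:
#   [{'entity_group': 'Author', 'word': 'Smith, J.', 'score': 0.99, 'start': 0, 'end': 9}, ...]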
# Class to encapsulate the bibliography-extraction pipeline
class MistralChatBot:
    def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
        self.system_prompt = system_prompt

    def predict(self, user_message):
        # Encode newlines as pilcrows so they survive tokenization
        editorial_text = re.sub("\n", " ¶ ", user_message)
        # Chunk the input when it exceeds the classifier's token budget
        num_tokens = len(tokenizer.tokenize(editorial_text))
        if num_tokens > 500:
            batch_prompts = split_text(editorial_text, max_tokens=500)
        else:
            batch_prompts = [editorial_text]
        out = token_classifier(batch_prompts)
        classified_list = []
        for classification in out:
            df = pd.DataFrame(classification)
            classified_list.append(df)
        classified_list = pd.concat(classified_list)
        html_output, bibtex_entry = transform_chunks(classified_list)
        generated_text = f'{css}<h2 style="text-align:center">Edited text</h2>\n<div class="generation">{html_output}</div>'
        return generated_text, bibtex_entry

# Create the chatbot instance
mistral_bot = MistralChatBot()
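
# A quick sanity check might look like this (the citation string is made up):
#   html, bib = mistral_bot.predict(
#       "Smith, J. (2020). On citations. Journal of Testing, 12, 1-10. doi: 10.1000/xyz"
#   )
#   print(bib)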
# Gradio interface metadata (currently unused by the Blocks UI below)
title = "Éditorialisation"
description = "An experimental tool for identifying the structure of a text with an encoder model (DeBERTa)"
examples = [
    [
        "Qui peut bénéficier de l'AIP?",  # user_message
        0.7,  # temperature
    ]
]
with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
    gr.HTML("""<h1 style="text-align:center">Reversed Zotero</h1>""")
    text_input = gr.Textbox(label="Your text", type="text", lines=1)
    text_button = gr.Button("Extract a structured bibtex")
    text_output = gr.HTML(label="Metadata")
    bibtex_output = gr.Textbox(label="BibTeX Entry", lines=10)
    text_button.click(mistral_bot.predict, inputs=text_input, outputs=[text_output, bibtex_output])

if __name__ == "__main__":
    demo.queue().launch()