import transformers import re from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline import torch import gradio as gr import json import os import shutil import requests import pandas as pd # Define the device device = "cuda" if torch.cuda.is_available() else "cpu" editorial_model = "PleIAs/Bibliography-Formatter" token_classifier = pipeline( "token-classification", model=editorial_model, aggregation_strategy="simple", device=device ) tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512) css = """ """ # Preprocess the 'word' column def preprocess_text(text): # Remove HTML tags text = re.sub(r'<[^>]+>', '', text) # Replace newlines with spaces text = re.sub(r'\n', ' ', text) # Replace multiple spaces with a single space text = re.sub(r'\s+', ' ', text) # Strip leading and trailing whitespace return text.strip() def split_text(text, max_tokens=500): # Split the text by newline characters parts = text.split("\n") chunks = [] current_chunk = "" for part in parts: # Add part to current chunk if current_chunk: temp_chunk = current_chunk + "\n" + part else: temp_chunk = part # Tokenize the temporary chunk num_tokens = len(tokenizer.tokenize(temp_chunk)) if num_tokens <= max_tokens: current_chunk = temp_chunk else: if current_chunk: chunks.append(current_chunk) current_chunk = part if current_chunk: chunks.append(current_chunk) # If no newlines were found and still exceeding max_tokens, split further if len(chunks) == 1 and len(tokenizer.tokenize(chunks[0])) > max_tokens: long_text = chunks[0] chunks = [] while len(tokenizer.tokenize(long_text)) > max_tokens: split_point = len(long_text) // 2 while split_point < len(long_text) and not re.match(r'\s', long_text[split_point]): split_point += 1 # Ensure split_point does not go out of range if split_point >= len(long_text): split_point = len(long_text) - 1 chunks.append(long_text[:split_point].strip()) long_text = long_text[split_point:].strip() if long_text: chunks.append(long_text) return chunks def create_bibtex_entry(data): author = data.get('Author', '').strip() title = data.get('Title', '').strip() journal = data.get('Journal', '').strip() year = data.get('Year', '').strip() volume = data.get('Volume', '').strip() pages = data.get('Pages', '').strip() doi = data.get('Doi', '').strip() # Remove "doi: " prefix if present doi = doi.replace('doi: ', '') bibtex = "@article{idnothing,\n" if author: bibtex += f" author = {{{author}}},\n" if title: bibtex += f" title = {{{title}}},\n" if journal: bibtex += f" journal = {{{journal}}},\n" if year: bibtex += f" year = {{{year}}},\n" if volume: bibtex += f" volume = {{{volume}}},\n" if pages: bibtex += f" pages = {{{pages}}},\n" if doi: bibtex += f" doi = {{{doi}}},\n" bibtex += "}" return bibtex These changes should result in a more complete and accurate BibTeX entry. The fields will only be included if they have content, and all the information from the input should now be properly captured. If you're still experiencing issues after making these changes, please provide an example of the input text you're using and the output you're getting. This will help me further diagnose and resolve any remaining problems. Claude does not have the ability to run the code it generates yet. Claude can make mistakes. Please double-check responses. def transform_chunks(marianne_segmentation): marianne_segmentation = pd.DataFrame(marianne_segmentation) marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator'] marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False) marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text) marianne_segmentation = marianne_segmentation[marianne_segmentation['word'].notna() & (marianne_segmentation['word'] != '') & (marianne_segmentation['word'] != ' ')] html_output = [] bibtex_data = {} current_entity = None for _, row in marianne_segmentation.iterrows(): entity_group = row['entity_group'] result_entity = "[" + entity_group.capitalize() + "]" word = row['word'] if entity_group in ['Author', 'Title', 'Journal', 'Pages', 'Doi']: if entity_group in bibtex_data: bibtex_data[entity_group] += ' ' + word else: bibtex_data[entity_group] = word current_entity = entity_group elif entity_group == 'None': if current_entity: bibtex_data[current_entity] += ' ' + word else: bibtex_data['None'] = bibtex_data.get('None', '') + ' ' + word html_output.append(f'

{result_entity}

{word}

') # Extract year from the 'None' field if present none_content = bibtex_data.get('None', '') year_match = re.search(r'\((\d{4})\)', none_content) if year_match: bibtex_data['Year'] = year_match.group(1) # Extract volume from the 'None' field if present volume_match = re.search(r',\s*(\d+),', none_content) if volume_match: bibtex_data['Volume'] = volume_match.group(1) bibtex_entry = create_bibtex_entry(bibtex_data) final_html = '\n'.join(html_output) return final_html, bibtex_entry # Class to encapsulate the Falcon chatbot class MistralChatBot: def __init__(self, system_prompt="Le dialogue suivant est une conversation"): self.system_prompt = system_prompt def predict(self, user_message): editorial_text = re.sub("\n", " ¶ ", user_message) num_tokens = len(tokenizer.tokenize(editorial_text)) if num_tokens > 500: batch_prompts = split_text(editorial_text, max_tokens=500) else: batch_prompts = [editorial_text] out = token_classifier(batch_prompts) classified_list = [] for classification in out: df = pd.DataFrame(classification) classified_list.append(df) classified_list = pd.concat(classified_list) html_output, bibtex_entry = transform_chunks(classified_list) generated_text = f'{css}

Edited text

{html_output}

' return generated_text, bibtex_entry # Create the Falcon chatbot instance mistral_bot = MistralChatBot() # Define the Gradio interface title = "Éditorialisation" description = "Un outil expérimental d'identification de la structure du texte à partir d'un encoder (Deberta)" examples = [ [ "Qui peut bénéficier de l'AIP?", # user_message 0.7 # temperature ] ] demo = gr.Blocks() with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo: gr.HTML("""

Reversed Zotero

""") text_input = gr.Textbox(label="Your text", type="text", lines=1) text_button = gr.Button("Extract a structured bibtex") text_output = gr.HTML(label="Metadata") bibtex_output = gr.Textbox(label="BibTeX Entry", lines=10) text_button.click(mistral_bot.predict, inputs=text_input, outputs=[text_output, bibtex_output]) if __name__ == "__main__": demo.queue().launch()