"""Research Community Analyzer.

A Gradio chatbot that parses researcher self-introduction messages
(WhatsApp-style "[timestamp] phone: Name • Affiliation: ..." lines),
builds a structured profile table, summarizes the community, renders
visualizations, and streams LLM-generated replies.
"""

import gradio as gr
import pandas as pd
import re
from huggingface_hub import InferenceClient
import spacy
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime

# Load SpaCy model for NLP once at import time (model download is a
# separate install step: `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

# Initialize Hugging Face inference client for the chat model.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def parse_message(message):
    """Extract researcher information from a single chat message.

    Expected shape: "[timestamp] phone: Name • Affiliation: ... • ...".
    Returns a dict with any of the keys: timestamp, phone, name,
    affiliation, research_field, thesis_topic, linkedin. Returns an
    empty dict for messages that carry no recognizable header/content.
    """
    info = {}

    # Extract timestamp and phone number from the "[ts] phone:" header.
    timestamp_match = re.search(r'\[(.*?)\]', message)
    phone_match = re.search(r'\] (.*?):', message)
    if timestamp_match and phone_match:
        info['timestamp'] = timestamp_match.group(1)
        info['phone'] = phone_match.group(1)
        # Slice right after the header's colon. Splitting the raw
        # message on ':' would break on timestamps such as "[... 12:30]"
        # whose colon comes before the header's terminating colon.
        content = message[phone_match.end():].strip()
    else:
        # No header: fall back to text after the first colon, if any.
        # (Guard prevents the IndexError the unguarded split would raise
        # on colon-free messages.)
        if ':' not in message:
            return info
        content = message.split(':', 1)[1].strip()

    # Extract name (leading text up to the first bullet/dash/newline).
    name_match = re.match(r'^([^•\n-]+)', content)
    if name_match:
        info['name'] = name_match.group(1).strip()

    # Extract affiliation.
    affiliation_match = re.search(r'[Aa]ffiliation:?\s*([^•\n]+)', content)
    if affiliation_match:
        info['affiliation'] = affiliation_match.group(1).strip()

    # Extract research field/interests (several English/French labels).
    field_match = re.search(
        r'([Ff]ield of [Ii]nterest|[Dd]omaine de recherche|[Rr]esearch area|[Aa]reas of interest):?\s*([^•\n]+)',
        content,
    )
    if field_match:
        info['research_field'] = field_match.group(2).strip()

    # Extract thesis topic.
    thesis_match = re.search(r'[Tt]hesis:?\s*([^•\n]+)', content)
    if thesis_match:
        info['thesis_topic'] = thesis_match.group(1).strip()

    # Extract LinkedIn URL.
    linkedin_match = re.search(r'https?://(?:www\.)?linkedin\.com\S+', content)
    if linkedin_match:
        info['linkedin'] = linkedin_match.group(0)

    return info


def create_researcher_df(chat_history):
    """Convert a newline-separated chat transcript to a DataFrame.

    Each non-empty line is parsed with ``parse_message``; lines that
    yield no fields are dropped.
    """
    researchers = []
    for message in chat_history.split('\n'):
        if message.strip():
            info = parse_message(message)
            if info:
                researchers.append(info)
    return pd.DataFrame(researchers)


def analyze_research_fields(df):
    """Count research fields across all researchers.

    Fields are comma-separated within a cell; they are split, trimmed,
    and lower-cased before counting. Returns a Series mapping field ->
    count (empty Series when no field data exists).
    """
    if 'research_field' not in df.columns:
        # Explicit dtype avoids pandas' empty-Series dtype warning.
        return pd.Series(dtype=int)
    fields = df['research_field'].dropna()
    # Split comma-separated fields and flatten into one list.
    all_fields = [
        field.strip().lower()
        for fields_list in fields
        for field in fields_list.split(',')
    ]
    return pd.Series(Counter(all_fields))


def create_visualizations(df):
    """Create plotly figures summarizing the researcher data.

    Returns a list containing (when data permits) an affiliation pie
    chart and a research-field bar chart.
    """
    figures = []

    # 1. Affiliation Distribution
    if 'affiliation' in df.columns:
        affiliation_counts = df['affiliation'].value_counts()
        fig_affiliation = px.pie(
            values=affiliation_counts.values,
            names=affiliation_counts.index,
            title='Distribution of Researchers by Affiliation'
        )
        figures.append(fig_affiliation)

    # 2. Research Fields Analysis
    field_counts = analyze_research_fields(df)
    if not field_counts.empty:
        fig_fields = px.bar(
            x=field_counts.index,
            y=field_counts.values,
            title='Popular Research Fields',
            labels={'x': 'Field', 'y': 'Count'}
        )
        figures.append(fig_fields)

    return figures


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    chat_history_text=""
):
    """Stream an LLM reply, optionally enriched with community analysis.

    When ``chat_history_text`` is provided, the transcript is parsed,
    summarized, and visualized; the summary is appended to the user
    message before querying the model.

    Yields ``(partial_response, figure_or_None)`` tuples so both
    Interface outputs (Textbox and Plot) are populated.
    """
    fig = None  # Plot output stays empty unless a transcript is given.

    # Process chat history if provided.
    if chat_history_text:
        df = create_researcher_df(chat_history_text)

        # Generate analysis summary.
        summary = f"Analysis of {len(df)} researchers:\n"
        if 'affiliation' in df.columns:
            summary += f"- Institutions represented: {df['affiliation'].nunique()}\n"

        field_counts = analyze_research_fields(df)
        if not field_counts.empty:
            top_fields = field_counts.nlargest(3)
            summary += "- Top research fields:\n"
            for field, count in top_fields.items():
                summary += f"  • {field}: {count} researchers\n"

        # Create visualizations; surface the first one on the Plot output.
        figures = create_visualizations(df)
        if figures:
            fig = figures[0]

        # Add analysis to the message sent to the model.
        message += f"\n\nCommunity Analysis:\n{summary}"

    # Assemble the chat-completion message list from prior turns.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    # Stream tokens, yielding the accumulated text plus the figure.
    response = ""
    for token in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token_content = token.choices[0].delta.content
        # The final stream chunk may carry a None delta; skip it to
        # avoid a `str + None` TypeError.
        if token_content:
            response += token_content
        yield response, fig


# Create enhanced Gradio interface.
demo = gr.Interface(
    fn=respond,
    inputs=[
        gr.Textbox(label="Message"),
        gr.State([]),  # conversation history
        gr.Textbox(value="You are a friendly Research Community Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Textbox(label="Chat History", lines=10)
    ],
    outputs=[
        gr.Textbox(label="Response"),
        gr.Plot(label="Community Analysis")
    ],
    title="Research Community Analyzer",
    description="An enhanced chatbot that analyzes research community data and provides visualizations."
)

if __name__ == "__main__":
    demo.launch()