Spaces:
Sleeping
Sleeping
File size: 6,067 Bytes
3852d49 3683f9c 3852d49 3683f9c 2162f38 3683f9c 3852d49 3683f9c 2162f38 3683f9c 2162f38 3683f9c 2162f38 3852d49 3683f9c 3852d49 3683f9c 2162f38 3683f9c 2162f38 3683f9c 2162f38 3683f9c 2162f38 3852d49 2162f38 3683f9c 3852d49 3683f9c 3852d49 2162f38 3852d49 3683f9c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import gradio as gr
import pandas as pd
import re
from huggingface_hub import InferenceClient
import plotly.express as px
from collections import Counter
# Initialize Hugging Face client
# Module-level client shared by all requests; targets the Zephyr-7B chat model
# via the HF serverless inference API (no local weights are loaded).
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
def parse_message(message):
    """Extract researcher information from a single chat line using regex.

    Expected shape (WhatsApp-style export, bullet-separated fields):
        "[timestamp] phone: Name • Affiliation: X • Field of interest: Y • Thesis: Z"

    Returns a dict containing whichever of the keys 'timestamp', 'phone',
    'name', 'affiliation', 'research_field', 'thesis_topic' could be parsed;
    an empty dict when the line carries no recognizable content.
    """
    info = {}
    # Timestamp is bracketed; the sender id sits between "] " and the first ":".
    timestamp_match = re.search(r'\[(.*?)\]', message)
    phone_match = re.search(r'\] (.*?):', message)
    if timestamp_match and phone_match:
        info['timestamp'] = timestamp_match.group(1)
        info['phone'] = phone_match.group(1)
    # Bug fix: a line without any ":" used to raise IndexError here
    # ("message.split(':', 1)[1]" on a one-element list). Bail out instead.
    parts = message.split(':', 1)
    if len(parts) < 2:
        return info
    content = parts[1].strip()
    # Name: leading run of text up to the first bullet, dash, or newline.
    name_match = re.match(r'^([^•\n-]+)', content)
    if name_match:
        info['name'] = name_match.group(1).strip()
    # Affiliation: everything after the label up to the next bullet/newline.
    affiliation_match = re.search(r'[Aa]ffiliation:?\s*([^•\n]+)', content)
    if affiliation_match:
        info['affiliation'] = affiliation_match.group(1).strip()
    # Research field/interests: several label variants (English and French).
    field_match = re.search(
        r'([Ff]ield of [Ii]nterest|[Dd]omaine de recherche|[Rr]esearch area|[Aa]reas of interest):?\s*([^•\n]+)',
        content,
    )
    if field_match:
        info['research_field'] = field_match.group(2).strip()
    # Thesis topic.
    thesis_match = re.search(r'[Tt]hesis:?\s*([^•\n]+)', content)
    if thesis_match:
        info['thesis_topic'] = thesis_match.group(1).strip()
    return info
def create_researcher_df(chat_history):
    """Turn raw chat text into a DataFrame with one row per parsed message.

    Blank lines and lines that yield no parsed fields are dropped.
    """
    parsed_rows = []
    for line in chat_history.split('\n'):
        if not line.strip():
            continue  # skip blank lines
        entry = parse_message(line)
        if entry:
            parsed_rows.append(entry)
    return pd.DataFrame(parsed_rows)
def analyze_research_fields(df):
    """Count occurrences of each research field across all researchers.

    Fields are comma-separated within a cell and compared case-insensitively.

    Returns a pandas Series mapping lowercase field name -> count; an empty
    int Series when the 'research_field' column is absent or has no data.
    """
    # Explicit dtype: a bare pd.Series() relies on a deprecated implicit default.
    if 'research_field' not in df.columns:
        return pd.Series(dtype=int)
    counts = Counter()
    for cell in df['research_field'].dropna():
        for field in cell.split(','):
            field = field.strip().lower()
            if field:  # skip empty tokens produced by trailing/double commas
                counts[field] += 1
    return pd.Series(counts, dtype=int)
def create_visualizations(df):
    """Build a plotly figure summarizing the researcher community.

    An affiliation pie chart takes priority; otherwise a bar chart of
    research-field counts is produced. Returns None when neither can be
    built (only one figure is returned, matching the single Plot output).
    """
    # 1. Affiliation distribution (preferred when the data is present).
    if 'affiliation' in df.columns and not df['affiliation'].empty:
        affiliation_counts = df['affiliation'].value_counts()
        return px.pie(
            values=affiliation_counts.values,
            names=affiliation_counts.index,
            title='Distribution of Researchers by Affiliation',
        )
    # 2. Fallback: research-field frequency bar chart.
    field_counts = analyze_research_fields(df)
    if not field_counts.empty:
        return px.bar(
            x=field_counts.index,
            y=field_counts.values,
            title='Popular Research Fields',
            labels={'x': 'Field', 'y': 'Count'},
        )
    return None
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    chat_history_text=""
):
    """Stream an LLM reply, optionally prefixed with a community analysis.

    When `chat_history_text` is given, a researcher DataFrame is built from
    it, a short textual summary is appended to the user message, and a
    community figure is produced.

    Yields (partial_text, figure) pairs so both Interface outputs
    (Textbox, Plot) are fed; figure is None when no chat history was given.
    """
    try:
        fig = None
        # Process chat history if provided.
        if chat_history_text:
            df = create_researcher_df(chat_history_text)
            # Bug fix: the figure was never produced before, leaving the
            # Plot output of the Interface permanently empty.
            fig = create_visualizations(df)
            # Generate analysis summary.
            summary = f"Analysis of {len(df)} researchers:\n"
            if 'affiliation' in df.columns:
                summary += f"- Institutions represented: {df['affiliation'].nunique()}\n"
            field_counts = analyze_research_fields(df)
            if not field_counts.empty:
                top_fields = field_counts.nlargest(3)
                summary += "- Top research fields:\n"
                for field, count in top_fields.items():
                    summary += f"  • {field}: {count} researchers\n"
            # Add analysis to the message sent to the LLM.
            message += f"\n\nCommunity Analysis:\n{summary}"
        # Rebuild the conversation in the chat-completion format.
        messages = [{"role": "system", "content": system_message}]
        for user_turn, assistant_turn in history:
            if user_turn:
                messages.append({"role": "user", "content": user_turn})
            if assistant_turn:
                messages.append({"role": "assistant", "content": assistant_turn})
        messages.append({"role": "user", "content": message})
        # Stream the response token by token.
        response = ""
        for token in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            # Bug fix: delta content is None on role/finish chunks; the old
            # unconditional concatenation raised TypeError mid-stream.
            response += token.choices[0].delta.content or ""
            yield response, fig
    except Exception as e:
        # Surface the failure in the chat box rather than crashing the UI.
        yield f"Error: {e}", None
# Create Gradio interface.
# The `inputs` list maps positionally onto respond()'s parameters:
# (message, history, system_message, max_tokens, temperature, top_p,
#  chat_history_text) — do not reorder one without the other.
# NOTE(review): `outputs` declares two components (Textbox, Plot) but
# respond() as written yields a single string per step — the Plot output
# never receives a figure; confirm intended wiring.
demo = gr.Interface(
    fn=respond,
    inputs=[
        gr.Textbox(label="Message"),
        gr.State([]),  # history: list of (user, assistant) tuples
        gr.Textbox(value="You are a friendly Research Community Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Textbox(label="Chat History", lines=10)  # raw chat export to analyze
    ],
    outputs=[
        gr.Textbox(label="Response"),
        gr.Plot(label="Community Analysis")
    ],
    title="CohortBot",
    description="A chatbot that analyzes research community data and provides visualizations."
)
# Launch only when run as a script (HF Spaces also imports this module).
if __name__ == "__main__":
    demo.launch()