# Spaces: Running
import gradio as gr | |
import pandas as pd | |
from pathlib import Path | |
from zipfile import ZipFile | |
import io | |
import contextlib | |
import requests | |
import random | |
from functools import lru_cache | |
import plotly.express as px | |
# Names the generator must never produce. The set is joined with commas and
# used as the default value of the "FORBIDDEN NAMES" textbox in the UI.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single (useless) entry.
FORBIDDEN_NAMES = {
    "Judas",
    "Judas Iscariot",
    "Maher-shalal-hash-baz",
    "Bathsheba",
    "Jephthah",
    "Jehoshaphat",
    "Tiebreaker",
    "Boanerges",
    "Jezebel",
    "Gomorrah",
    "Hymenaeus",
    "Herod",
    "Pilate",
    "Doeg",
    "Ziph",
    "Phygelus",
    "Hermogenes",
    "Philetus",
    "Balaam",
    "Achan",
    "Caiaphas",
    "Pontius",
    "Ahab",
    "Manasseh",
    "Rehoboam",
    "Nebuchadnezzar",
    "Delilah",
    "Lo-ammi",
    "Lo-ruhamah",
    "Beelzebub",
    "Ichabod",
    "Saphira",
    "Jushab-hesed",
    "Benjarman",
    "Cain",
    "Esau",
    "Machiavelli",  # found
    "Barabbas",
    "Sapphira",
    "Shur",
    "Pontius Pilate",
}
# --- File download & setup --- | |
def download_file(url: str, dest_path: Path, timeout: float = 30.0):
    """Download ``url`` to ``dest_path`` unless that file already exists.

    Args:
        url: Address fetched with an HTTP GET request.
        dest_path: Target file. If it already exists, nothing is downloaded
            and the existing file is left untouched.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang application start-up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    if dest_path.exists():
        print(f"{dest_path.name} already exists. Skipping download.")
        return
    print(f"Downloading {url}")
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as data.
    response.raise_for_status()
    dest_path.write_bytes(response.content)
    print(f"Saved to {dest_path}")
def extract_names_zip():
    """Unpack ``names.zip`` from the working directory into that directory.

    Raises:
        FileNotFoundError: If ``names.zip`` is not present.
    """
    archive = Path("names.zip")
    if archive.exists():
        with ZipFile(archive, mode="r") as bundle:
            bundle.extractall(".")
        print("Unzipped names.zip")
        return
    raise FileNotFoundError("names.zip not found. Please upload it manually to the repo.")
# Unpack names.zip first: the yob*.txt glob further down runs at import time.
extract_names_zip()

# Download Bible CSVs if missing
_BIBLE_DATA_ROOT = "https://raw.githubusercontent.com/BradyStephenson/bible-data/refs/heads/main"
for _bible_csv in ("BibleData-Person.csv", "BibleData-PersonLabel.csv"):
    download_file(f"{_BIBLE_DATA_ROOT}/{_bible_csv}", Path(_bible_csv))
# --- Load datasets --- | |
# Per-year SSA files (yobYYYY.txt) found in the working directory at import time.
ssa_name_txt_files = sorted(Path(".").glob("yob*.txt"))


@lru_cache(maxsize=1)
def load_all_ssa_names():
    """Load every ``yob*.txt`` file into one DataFrame.

    The result is cached: parsing all the per-year files is expensive and this
    function is called once per generated name and once per plot. Callers must
    treat the returned frame as read-only (filter it, do not mutate it in
    place); call ``load_all_ssa_names.cache_clear()`` to force a reload.

    Returns:
        DataFrame with columns ``name``, ``sex``, ``count`` and ``year``,
        where ``year`` is parsed from each file name.

    Raises:
        ValueError: If no ``yob*.txt`` files were found.
    """
    if not ssa_name_txt_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError("No SSA name files (yob*.txt) found; cannot load name data.")
    dfs = []
    for f in ssa_name_txt_files:
        year = int(f.stem.replace("yob", ""))
        df = pd.read_csv(f, names=["name", "sex", "count"])
        df["year"] = year
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)
def load_ssa_names(min_year=0, max_year=9999):
    """Return SSA rows within a year range plus per-name totals.

    Args:
        min_year: First year to include (inclusive).
        max_year: Last year to include (inclusive).

    Returns:
        A ``(filtered_df, agg_df)`` tuple. ``filtered_df`` holds the raw
        per-year rows in range; ``agg_df`` has one row per ``(name, sex)``
        with the summed ``count``, sorted by ``count`` descending. When no
        rows fall in the range both frames are empty but keep their columns,
        so callers can index by column name without special-casing.
    """
    full_df = load_all_ssa_names()
    in_range = (full_df["year"] >= min_year) & (full_df["year"] <= max_year)
    filtered_df = full_df[in_range]
    if filtered_df.empty:
        # A bare pd.DataFrame() has no columns and makes downstream column
        # lookups fail with KeyError; preserve the schema instead.
        return filtered_df, filtered_df[["name", "sex", "count"]].copy()
    agg_df = (
        filtered_df
        .groupby(["name", "sex"], as_index=False)["count"]
        .sum()
        .sort_values("count", ascending=False)
    )
    return filtered_df, agg_df
def load_bible_names():
    """Load Bible person labels restricted to proper names, with sex codes.

    Reads ``BibleData-Person.csv`` and ``BibleData-PersonLabel.csv`` from the
    working directory, attaches each person's ``sex`` to their labels via
    ``person_id`` and keeps only rows whose ``label_type`` is "proper name".

    Returns:
        DataFrame of proper-name labels whose ``sex`` column has "male" and
        "female" replaced by "M" and "F"; other values are left as they are.
    """
    people = pd.read_csv("BibleData-Person.csv")
    labels = pd.read_csv("BibleData-PersonLabel.csv")
    labels = labels.merge(people[["person_id", "sex"]], on="person_id", how="left")
    # Take an explicit copy: assigning a column on a boolean-filtered slice
    # triggers pandas' chained-assignment (SettingWithCopy) warning.
    proper_names = labels[labels["label_type"] == "proper name"].copy()
    proper_names["sex"] = proper_names["sex"].replace({"male": "M", "female": "F"})
    return proper_names
# Bible proper-name labels, loaded once when the module is imported.
bible_names_personlabel_df = load_bible_names()

# --- Name generation logic ---
# Surnames to choose from when no last name is supplied.
last_names = [
    "Smith",
    "Johnson",
    "Williams",
    "Taylor",
    "Brown",
]
def _filter_ssa_names(ssa_df, names_col, min_length, max_length,
                      popularity_percentile, sex, forbidden_names, debug):
    """Apply forbidden-name, length, sex and popularity filters to SSA totals."""
    if ssa_df.empty:
        # The loader may hand back a frame without columns when no rows match
        # the year range; there is nothing to filter in that case.
        return ssa_df
    filtered = ssa_df[~ssa_df[names_col].isin(forbidden_names)]
    if debug:
        print(f"SSA names after FORBIDDEN NAMES filter: {len(filtered)}")
    filtered = filtered[filtered[names_col].str.len().between(min_length, max_length)]
    if sex:
        filtered = filtered[filtered["sex"] == sex]
    if debug:
        print(f"SSA names after length/sex filter: {len(filtered)}")
    # Sort ascending by count so that a (low, high) percentile pair maps to a
    # positional slice: (0.95, 1.0) keeps the most common 5% of names.
    total = len(filtered)
    low, high = popularity_percentile
    filtered = filtered.sort_values("count").iloc[int(total * low):int(total * high)]
    if debug:
        print(f"SSA names after popularity percentile slice: {len(filtered)}")
    return filtered


def _filter_bible_names(bible_df, names_col, min_length, max_length,
                        sex, forbidden_names, debug):
    """Apply length, sex and forbidden-name filters to the Bible name labels."""
    filtered = bible_df
    if debug:
        print(f"Bible names before filtering: {len(filtered)}")
    filtered = filtered[filtered[names_col].str.len().between(min_length, max_length)]
    if debug:
        print(f"Bible names after lengthfiltering: {len(filtered)}")
    if sex:
        filtered = filtered[filtered["sex"] == sex]
    if debug:
        print(f"Bible names after sex filtering: {len(filtered)}")
    filtered = filtered[~filtered[names_col].isin(forbidden_names)]
    if debug:
        print(f"Bible names after FORBIDDEN NAMES filtering: {len(filtered)}")
    return filtered


def get_normal_and_bible(
    bible_names_df,
    min_length_ssa=3,
    max_length_ssa=8,
    min_length_bible=3,
    max_length_bible=8,
    min_year_ssa=0,
    max_year_ssa=9999,
    ssa_popularity_percentile=(0.95, 1.0),
    sex=None,
    forbidden_names=None,
    ssa_names_col="name",
    bible_names_col="english_label",
    debug=False,
):
    """Pick one random SSA name and one random Bible name matching the filters.

    Args:
        bible_names_df: Bible name labels; must contain ``bible_names_col``
            and a ``sex`` column.
        min_length_ssa, max_length_ssa: Inclusive length bounds for SSA names.
        min_length_bible, max_length_bible: Inclusive length bounds for Bible
            names.
        min_year_ssa, max_year_ssa: Inclusive year range of SSA data to use.
        ssa_popularity_percentile: ``(low, high)`` fractions selecting a slice
            of the SSA names ranked by ascending total count.
        sex: "M" or "F" to restrict both sources; any falsy value disables the
            sex filter.
        forbidden_names: Names excluded from both sources; ``None`` means no
            exclusions.
        ssa_names_col: Column holding the SSA name.
        bible_names_col: Column holding the Bible name.
        debug: When true, print the number of candidates after each filter.

    Returns:
        ``(ssa_name, bible_name)``.

    Raises:
        ValueError: If either candidate pool is empty after filtering.
    """
    if forbidden_names is None:
        forbidden_names = set()

    _, ssa_names_aggregated_df = load_ssa_names(min_year=min_year_ssa, max_year=max_year_ssa)
    if debug:
        print(f"There are {len(ssa_names_aggregated_df)} SSA names from the years {min_year_ssa} to {max_year_ssa}")

    filtered_ssa = _filter_ssa_names(
        ssa_names_aggregated_df,
        ssa_names_col,
        min_length_ssa,
        max_length_ssa,
        ssa_popularity_percentile,
        sex,
        forbidden_names,
        debug,
    )
    filtered_bible = _filter_bible_names(
        bible_names_df,
        bible_names_col,
        min_length_bible,
        max_length_bible,
        sex,
        forbidden_names,
        debug,
    )

    # Check both pools before sampling: sampling an empty frame raises an
    # unrelated pandas error that hides the real cause.
    if filtered_bible.empty or filtered_ssa.empty:
        raise ValueError("No valid names found after filtering.")

    ssa_name = filtered_ssa.sample(1)[ssa_names_col].values[0]
    bible_name = filtered_bible.sample(1)[bible_names_col].values[0]
    return ssa_name, bible_name
# -------------------- Plotting --- | |
import plotly.graph_objects as go | |
def plot_name_trends_plotly(df, names, start_year=None, end_year=None, logscale=False):
    """Plot yearly counts for the given names and return the figure and data.

    Args:
        df: Per-year name data with ``name``, ``year`` and ``count`` columns.
        names: Names to include (exact match against ``name``).
        start_year: Optional inclusive lower bound on ``year``.
        end_year: Optional inclusive upper bound on ``year``.
        logscale: Use a logarithmic y axis when true, linear otherwise.

    Returns:
        ``(fig, agg_df)`` where ``agg_df`` holds one row per year and name
        with the summed count.

    Raises:
        gr.Error: If no rows remain for the selected names and year range.
    """
    selection = df["name"].isin(names)
    if start_year is not None:
        selection = selection & (df["year"] >= start_year)
    if end_year is not None:
        selection = selection & (df["year"] <= end_year)
    name_df = df[selection]

    if name_df.empty:
        raise gr.Error("No data for selected names and year range.")

    yearly_totals = name_df.groupby(["year", "name"])["count"].sum()
    agg_df = yearly_totals.reset_index()

    # One trace per name, added in alphabetical order.
    fig = go.Figure()
    for name in sorted(agg_df["name"].unique()):
        sub_df = agg_df[agg_df["name"] == name]
        if len(sub_df) > 1:
            trace_style = {"mode": "lines+markers"}
        else:
            # A single data point has no line to draw, so show it as a
            # larger stand-alone marker instead.
            trace_style = {"mode": "markers", "marker": dict(size=10, symbol="circle")}
        fig.add_trace(
            go.Scatter(x=sub_df["year"], y=sub_df["count"], name=name, **trace_style)
        )

    y_axis_kind = "log" if logscale else "linear"
    fig.update_layout(
        title="Name Usage Over Time",
        xaxis_title="Year",
        yaxis_title="Count",
        height=500,
        yaxis_type=y_axis_kind,
    )
    return fig, agg_df
def plot_from_inputs(name_text, start_year, end_year, logscale):
    """Parse a comma-separated list of names and plot their SSA trends.

    Raises:
        gr.Error: If ``name_text`` contains no non-blank name.
    """
    names = []
    for piece in name_text.split(","):
        cleaned = piece.strip()
        if cleaned:
            names.append(cleaned)
    if len(names) == 0:
        raise gr.Error("Please enter at least one name.")
    return plot_name_trends_plotly(load_all_ssa_names(), names, start_year, end_year, logscale)
# --- Gradio app --- | |
def generate_names(n, sex, ssa_min_len, ssa_max_len,
                   ssa_min_year,
                   ssa_max_year,
                   min_bible_len, max_bible_len, pop_low, pop_high, debug_flag, last, forbidden_names_text, bible_names_first_flag):
    """Generate ``n`` full names and collect debug output.

    Args:
        n: Number of names to generate (coerced to ``int``).
        sex: "M" or "F" to filter by sex; any other value disables the filter.
        ssa_min_len, ssa_max_len: Inclusive length bounds for the SSA name.
        ssa_min_year, ssa_max_year: Inclusive year range of SSA data.
        min_bible_len, max_bible_len: Inclusive length bounds for the Bible
            name.
        pop_low, pop_high: SSA popularity percentile bounds.
        debug_flag: When true, filter statistics are printed for the first
            name only.
        last: Last name to use. ``None`` or a blank string selects a random
            entry from ``last_names``, shared by every name in the batch.
        forbidden_names_text: Comma-separated names to exclude.
        bible_names_first_flag: When true the Bible name is the first name and
            the SSA name the middle name; otherwise the reverse.

    Returns:
        ``(names_text, debug_text)``: generated names joined by newlines (a
        failed attempt appears as ``[Error: ...]``), and everything printed
        while generating.
    """
    results = []
    debug_output = io.StringIO()
    forbidden_names = {name.strip() for name in forbidden_names_text.split(",") if name.strip()}

    # An empty textbox arrives as "" rather than None, so test for blank text;
    # otherwise the fallback never triggers and names end in a trailing space.
    if last is None or not last.strip():
        last = random.choice(last_names)

    sex_filter = sex if sex in {"M", "F"} else None

    with contextlib.redirect_stdout(debug_output):
        # int(): range() rejects floats, which a numeric UI widget may supply.
        for i in range(int(n)):
            try:
                normal, bible = get_normal_and_bible(
                    bible_names_personlabel_df,
                    min_length_ssa=ssa_min_len,
                    max_length_ssa=ssa_max_len,
                    min_year_ssa=ssa_min_year,
                    max_year_ssa=ssa_max_year,
                    min_length_bible=min_bible_len,
                    max_length_bible=max_bible_len,
                    ssa_popularity_percentile=(pop_low, pop_high),
                    sex=sex_filter,
                    forbidden_names=forbidden_names,
                    debug=(i == 0 and debug_flag),
                )
            except Exception as e:
                # Deliberate best-effort: report the failure in the output
                # list and keep generating the remaining names.
                results.append(f"[Error: {e}]")
                continue
            if bible_names_first_flag:
                first, middle = bible, normal
            else:
                first, middle = normal, bible
            results.append(f"{first} {middle} {last}")
    return "\n".join(results), debug_output.getvalue()
# Two-tab Gradio UI: a name generator and an SSA trend plotter.
# NOTE(review): the source this was recovered from had lost its indentation,
# so the placement of the checkboxes relative to the preceding gr.Row() is a
# best guess — confirm against the running app.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("🔀 Generate Names"):
            gr.Markdown("# 📜 Random Bible + SSA Name Generator")

            with gr.Row():
                n_slider = gr.Slider(1, 20, value=5, step=1, label="How many names?")
                sex_choice = gr.Radio(["M", "F", "Any"], label="Sex", value="Any")

            with gr.Row():
                ssa_min_len = gr.Slider(1, 40, value=1, step=1, label="SSA name min length")
                ssa_max_len = gr.Slider(1, 40, value=40, step=1, label="SSA name max length")

            with gr.Row():
                ssa_min_year = gr.Slider(1880, 2024, value=1880, step=1, label="SSA name min year")
                ssa_max_year = gr.Slider(1880, 2024, value=2024, step=1, label="SSA name max year")

            with gr.Row():
                bible_len = gr.Slider(1, 40, value=1, step=1, label="Bible name min length")
                bible_max_len = gr.Slider(1, 40, value=40, step=1, label="Bible name max length")

            with gr.Row():
                pop_low_slider = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="SSA Popularity: Low Percentile")
                pop_high_slider = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="SSA Popularity: High Percentile")

            with gr.Row():
                last_name_input = gr.Textbox(label="Last Name")

            with gr.Row():
                forbidden_names_input = gr.Textbox(
                    label="FORBIDDEN NAMES (comma-separated)",
                    value=",".join(FORBIDDEN_NAMES),
                )

            debug_checkbox = gr.Checkbox(label="Show debug output", value=True)
            bible_name_first_checkbox = gr.Checkbox(label="Bible name first?", value=True)

            generate_btn = gr.Button("🔀 Generate Names")
            output_box = gr.Textbox(label="Generated Names", lines=10)
            debug_box = gr.Textbox(label="Debug Output", lines=10)

            # Order must match the positional parameters of generate_names.
            generate_inputs = [
                n_slider,
                sex_choice,
                ssa_min_len,
                ssa_max_len,
                ssa_min_year,
                ssa_max_year,
                bible_len,
                bible_max_len,
                pop_low_slider,
                pop_high_slider,
                debug_checkbox,
                last_name_input,
                forbidden_names_input,
                bible_name_first_checkbox,
            ]
            generate_btn.click(
                fn=generate_names,
                inputs=generate_inputs,
                outputs=[output_box, debug_box],
            )

        with gr.Tab("📈 Name Trends"):
            gr.Markdown("# 📈 SSA Name Trends Over Time")

            with gr.Row():
                trend_names_input = gr.Textbox(
                    label="Name(s) to plot (comma-separated)",
                    placeholder="e.g. Zebediah, Remington, Jessca, Jielle",
                )

            with gr.Row():
                trend_start_year = gr.Slider(1880, 2024, value=1950, step=1, label="Start Year")
                trend_end_year = gr.Slider(1880, 2024, value=2024, step=1, label="End Year")

            trend_logscale = gr.Checkbox(label="Log scale?", value=False)
            plot_button = gr.Button("📊 Plot Trends")
            plot_output = gr.Plot(label="Trend Plot")
            table_output = gr.Dataframe(label="Underlying Data")

            # Order must match the positional parameters of plot_from_inputs.
            trend_inputs = [trend_names_input, trend_start_year, trend_end_year, trend_logscale]
            plot_button.click(
                fn=plot_from_inputs,
                inputs=trend_inputs,
                outputs=[plot_output, table_output],
            )

demo.launch()