File size: 7,061 Bytes
0472866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lifelines import KaplanMeierFitter
from matplotlib.colors import LinearSegmentedColormap
from utils import save_plot


def load_and_prepare_data(file_path, debug_csv_path="debug.csv"):
    """
    Load the developer activity CSV and derive the cohort / retention columns.

    The CSV must provide the columns 'developer', 'month_year' (formatted like
    'September_2021') and 'total_commits'. Processing steps:

    1. Parse 'month_year' and keep only rows from 2021-09-01 onwards.
    2. Flag a row as 'Active' when it has at least one commit.
    3. Per developer, take the first active month as 'FirstActiveMonth' and its
       monthly period as 'Cohort'.
    4. 'Order' is the number of calendar months between the row and the cohort
       month; rows before the cohort month, and developers that were never
       active, are dropped.
    5. 'Inactive_Month' is True on a row when that row and the previous row of
       the same developer are both inactive; 'inactive_for_two_months' is 1 for
       every row of a developer that has at least one such row, else 0.
    6. 'duration' is the number of distinct months kept for the developer.

    Args:
        file_path: Path of the CSV file to read.
        debug_csv_path: Where to dump the prepared frame for inspection.
            Defaults to "debug.csv" in the working directory (the historical
            behaviour); pass None to skip the dump.

    Returns:
        The prepared pandas DataFrame, sorted by developer and month.
    """
    df = pd.read_csv(file_path)
    df["month_year"] = pd.to_datetime(df["month_year"], format="%B_%Y")

    # Copy after filtering so the column assignments below operate on an
    # independent frame instead of a slice of the raw data.
    df = df[df["month_year"] >= "2021-09-01"].copy()
    df["Active"] = df["total_commits"] > 0
    # The rolling window further down relies on chronological order per developer.
    df = df.sort_values(by=["developer", "month_year"])

    first_active_month = (
        df[df["Active"]]
        .groupby("developer")["month_year"]
        .min()
        .reset_index()
        .rename(columns={"month_year": "FirstActiveMonth"})
    )

    # Left merge: developers without any active month get NaT here.
    df = df.merge(first_active_month, on="developer", how="left")
    df["Cohort"] = df["FirstActiveMonth"].dt.to_period("M")

    # Month distance computed column-wise (year * 12 + month) instead of a
    # row-by-row apply; NaT cohorts propagate as NaN and fail the >= 0 filter.
    month_index = df["month_year"].dt.year * 12 + df["month_year"].dt.month
    cohort_index = (
        df["FirstActiveMonth"].dt.year * 12 + df["FirstActiveMonth"].dt.month
    )
    df["Order"] = month_index - cohort_index

    df = df[df["Order"] >= 0].copy()
    # No NaN can remain after the filter, so the column is safely integral.
    df["Order"] = df["Order"].astype(int)

    # With min_periods=2 the first row of each developer yields NaN, which
    # compares unequal to 0, so the cohort month is never flagged inactive.
    df["Inactive_Month"] = df.groupby("developer")["Active"].transform(
        lambda x: x.rolling(window=2, min_periods=2).sum() == 0
    )
    df["inactive_for_two_months"] = (
        df.groupby("developer")["Inactive_Month"].transform("max").astype(int)
    )

    df["duration"] = df.groupby("developer")["month_year"].transform("nunique")

    if debug_csv_path is not None:
        df.to_csv(debug_csv_path, index=False)

    return df


def visualize_developer_retention(df):
    """
    Draw a cohort retention heatmap and hand the figure to `save_plot`.

    For every (Cohort, Order) pair the number of distinct developers whose row
    is not flagged 'Inactive_Month' is counted. Each cohort column is then
    divided by its value in the first 'Order' row, so cells hold the share of
    the cohort still counted as active. Cohorts end up on the y-axis and
    'Order' (months since the cohort month) on the x-axis.

    Args:
        df: Frame providing the columns 'Inactive_Month', 'Cohort', 'Order'
            and 'developer'.
    """
    cohort_counts = (
        df[~df["Inactive_Month"]]
        .groupby(["Cohort", "Order"])
        .developer.nunique()
        .unstack(0)
    )

    # First row of the Order index = smallest month offset; used as cohort size.
    cohort_sizes = cohort_counts.iloc[0]
    retention = cohort_counts.divide(cohort_sizes, axis=1)

    # Red -> orange -> yellow within the lowest 20% of the scale, then green.
    colors = [(0, "#FF0000"), (0.15, "#FFA500"), (0.2, "#FFFF00"), (1, "#008000")]
    cmap = LinearSegmentedColormap.from_list("custom_cmap", colors, N=256)
    plt.figure(figsize=(12, 8))  # Adjusted figure size for better visibility
    sns.heatmap(retention.T, annot=False, cmap=cmap)
    plt.title("Journey Through Code: Tracking Developer Engagement Over Time", pad=20)

    # Leave room below the axes for the explanatory caption.
    plt.subplots_adjust(bottom=0.3)

    # Adjacent literals are concatenated verbatim, so every fragment after the
    # first carries its own leading space.
    description_text = (
        "This heatmap visualizes the engagement journey of developers, tracked monthly across cohorts."
        " Each cohort represents developers who began contributing in the same month."
        " The color gradient from red to green signifies the evolution of active engagement over time,"
        " with red indicating lower engagement levels and green denoting higher activity."
        " Cohorts are plotted on the y-axis, and the actual months since the start of the cohort on the x-axis."
        " This visualization offers insights into how developer activity trends evolve,"
        " highlighting periods of increased or decreased engagement and aiding in understanding"
        " the effectiveness of retention strategies over time."
        " Parameters:"
        " (a) A developer is considered inactive if they have at least 2 continuous inactive months."
        " (b) With one commit in a month, the developer is considered active."
        " (c) The data is filtered to include only entries from September 2021 onwards."
    )
    plt.figtext(0.5, -0.0001, description_text, ha="center", fontsize=9, wrap=True)

    save_plot(plt, "developer_engagement_journey")


def survival_curve_analysis_and_plot(df):
    """
    Fit a Kaplan-Meier estimator per developer and plot the survival curve.

    The frame is collapsed to one row per developer (first 'duration' value,
    last 'inactive_for_two_months' value). 'duration' is used as the observed
    time and 'inactive_for_two_months' as the event indicator. The plot is
    annotated with the 0.5 probability line, a marker at month 3 and, when it
    exists, the median survival time; the figure is then passed to `save_plot`.

    NOTE(review): 'duration' is read here as time-to-event; confirm that the
    upstream definition of that column matches this interpretation.

    Args:
        df: Frame providing the columns 'developer', 'duration' and
            'inactive_for_two_months'.
    """
    summary_df = (
        df.groupby("developer")
        .agg({"duration": "first", "inactive_for_two_months": "last"})
        .reset_index()
    )

    kmf = KaplanMeierFitter()
    kmf.fit(
        durations=summary_df["duration"],
        event_observed=summary_df["inactive_for_two_months"],
        label="Developer Survival Probability",
    )

    plt.figure(figsize=(10, 6))
    ax = plt.subplot(111)
    kmf.plot_survival_function(ax=ax)

    plt.title("Developer Survival Curve: Probability of Active Contribution Over Time")
    plt.grid(True, which="both", linestyle="--", linewidth=0.5)
    ax.axhline(y=0.5, color="red", linestyle="--")

    ax.axvline(x=3, color="green", linestyle="--")
    ax.text(
        3,
        0.95,
        "Inactive Month + 1",
        verticalalignment="top",
        horizontalalignment="center",
        color="green",
        fontsize=8,
    )

    median_survival_time = kmf.median_survival_time_
    # The median is infinite when the curve never drops to 0.5; in that case
    # there is nothing to mark and int() on it would raise OverflowError.
    if np.isfinite(median_survival_time):
        ax.text(
            median_survival_time,
            0.48,
            "Median Survival Time",
            verticalalignment="center",
            color="red",
            fontsize=8,
        )
        ax.axvline(x=median_survival_time, color="green", linestyle="--")
        ax.text(
            # Right-aligned at the longest observed duration, i.e. the right
            # end of the data, rather than at the count of distinct durations.
            summary_df["duration"].max(),
            0.9,
            f"After month {int(median_survival_time)} the probability of developers staying is lower than 50 percent",
            verticalalignment="top",
            horizontalalignment="right",
            color="green",
            fontsize=8,
        )
    ax.set_yticks(np.arange(0, 1.1, 0.1))

    # Setting the x-axis and y-axis labels as per the request
    plt.xlabel("Months since the developer started committing code")
    plt.ylabel("Probability of a developer staying in the ecosystem")

    # Adjacent literals are concatenated verbatim, so every fragment after the
    # first carries its own leading space.
    description_text = (
        "The Kaplan-Meier survival curve shows the probability of developers continuing to contribute over time."
        " Parameters:"
        " (a) A developer is considered inactive if they have at least 2 continuous inactive months."
        " (b) With one commit in a month, the developer is considered active."
        " (c) The data is filtered to include only entries from September 2021 onwards."
        " The Kaplan-Meier estimator is a non-parametric statistic used to estimate the survival function from lifetime data."
        " It requires to know the duration each subject was observed for, and whether the event of interest"
        " (in this case, becoming inactive for two months) was observed."
        " The 'Median Survival Time' shows when the chance of further contributions drops below 50%."
        " This analysis helps in understanding the retention of developers and predicting future contribution patterns."
    )
    plt.figtext(0.1, -0.1, description_text, ha="left", fontsize=8, wrap=True)

    save_plot(plt, "developer_survival_curve")


if __name__ == "__main__":
    # Prepare the activity data once, then render both reports from it.
    prepared = load_and_prepare_data(
        "data/source/all_networks_developer_classification.csv"
    )
    for render in (visualize_developer_retention, survival_curve_analysis_and_plot):
        render(prepared)