File size: 6,195 Bytes
57c87c9
 
 
 
404478b
57c87c9
 
 
 
404478b
 
 
 
 
 
 
 
4dd059d
404478b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b58eec2
 
 
 
 
 
 
 
 
404478b
b58eec2
 
404478b
 
 
 
 
4dd059d
 
 
 
 
 
 
 
404478b
 
57c87c9
b58eec2
57c87c9
570845b
 
 
 
404478b
 
4dd059d
 
404478b
 
 
 
570845b
404478b
570845b
 
 
 
 
 
 
 
 
 
 
 
 
 
b58eec2
570845b
404478b
570845b
 
 
404478b
 
 
 
 
 
 
570845b
404478b
570845b
404478b
570845b
 
 
404478b
 
 
 
 
 
 
 
 
 
 
 
 
 
570845b
b58eec2
570845b
404478b
570845b
 
404478b
 
570845b
 
 
57c87c9
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
from datetime import datetime

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

# from load_dataframe import get_data


def aggregated_data(df, aggregation_level="week"):
    """Plot the percentage of papers with at least one artifact over time.

    The rows of ``df`` are resampled per week or per month; for every period
    the share of rows that have a model, dataset or Space is computed and the
    resulting line chart is rendered in the Streamlit app.

    Args:
        df: DataFrame that supports ``resample`` on its index and has the
            columns ``num_models``, ``num_datasets`` and ``num_spaces``.
            The frame is not modified.
        aggregation_level: ``"week"`` resamples with frequency ``'W'``; any
            other value resamples with ``'ME'``. The value is also shown in
            the heading and used as the x-axis label.
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # Flag papers that have any artifact. Kept as a standalone Series so the
    # caller's DataFrame does not silently gain a column.
    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    # Resample by week or by month
    freq = 'W' if aggregation_level == "week" else 'ME'
    total_papers = df.resample(freq).size()
    papers_with_artifacts = has_artifact.resample(freq).sum()

    # Periods without any paper yield 0/0 = NaN and show up as gaps in the line
    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

    # Draw on an explicit figure instead of the global pyplot state
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        percentage_papers_with_artifacts.index,
        percentage_papers_with_artifacts,
        marker='o',
        linestyle='-',
        color='b',
        label='Percentage of Papers with at least 1 Artifact',
    )

    # Percentages always live in [0, 100]
    ax.set_ylim(0, 100)

    ax.set_xlabel(aggregation_level)
    ax.set_ylabel('Percentage')
    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
    ax.legend()
    ax.grid(True)

    # Use Streamlit to display the plot, then release the figure so that
    # figures do not pile up across reruns of the script
    st.pyplot(fig)
    plt.close(fig)


def _show_papers_table(papers):
    """Render ``papers`` as a table with link columns for the paper page and GitHub."""
    st.dataframe(papers,
                 hide_index=True,
                 column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
                 column_config={"github": st.column_config.LinkColumn(),
                                "paper_page": st.column_config.LinkColumn()},
                 width=2000)


def display_data(df):
    """Show summary counts and three paper tables for ``df`` in the Streamlit app.

    The tables list papers with at least one artifact, papers without any
    artifact, and papers without artifacts whose ``hf_mention`` column equals 1.

    Args:
        df: DataFrame with the columns ``paper_page``, ``title``, ``github``,
            ``num_models``, ``num_datasets``, ``num_spaces`` and
            ``hf_mention``. The frame is not modified.
    """
    # Computed once and kept as a standalone mask: callers pass filtered
    # slices, and assigning a new column on a slice can trigger pandas'
    # SettingWithCopyWarning.
    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
    num_artifacts = int(has_artifact.sum())

    st.markdown(f"""
    ## Number of papers: {df.shape[0]}
    #### Number of papers with a Github link: {df['github'].notnull().sum()}
    #### Number of papers with at least one HF artifact: {num_artifacts}
    """)

    st.write("Papers with at least one artifact")
    _show_papers_table(df[has_artifact])

    st.write("Papers without artifacts")
    _show_papers_table(df[~has_artifact])

    st.write("Papers with a HF mention in README but no artifacts")
    _show_papers_table(df[(df['hf_mention'] == 1) & (~has_artifact)])


def main(csv_path='/Users/nielsrogge/Downloads/daily_papers_enriched (3).csv'):
    """Build the Streamlit dashboard.

    Loads the papers CSV, indexes it by its parsed ``date`` column and, based
    on the sidebar selection, shows either the papers of one day/week/month
    or the aggregated weekly and monthly plots.

    Args:
        csv_path: Path of the CSV file to load. It must contain a ``date``
            column; an ``Unnamed: 0`` column is dropped when present.
    """
    st.title("Hugging Face Artifacts KPI Dashboard")

    # Sidebar navigation between the per-period view and the aggregated plots
    st.sidebar.title("Navigation")
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    # TODO use this instead
    # df = get_data()
    df = pd.read_csv(csv_path)
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    # Use date as index
    df = df.set_index('date')
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()

    if selection == "Daily/weekly/monthly data":
        # Select whether to view a single day, week or month
        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])

        if view_level == "day":
            # Date picker, defaulting to today
            day = st.date_input("Select day", value="today", format="DD/MM/YYYY")
            # Convert to a Pandas Timestamp to get day_name()/strftime()
            day = pd.Timestamp(day)

            df = df[df.index.date == day.date()]

            st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")

            display_data(df)

        elif view_level == "week":
            # Week picker, defaulting to the current ISO week. ISO years can
            # have 53 weeks, so the upper bound must be 53; with 52 the
            # default would exceed max_value during week 53.
            current_week = datetime.today().isocalendar()[1]
            week_number = st.number_input("Select week", value=current_week, min_value=1, max_value=53)

            # Extract week number from the index
            df['week'] = df.index.isocalendar().week

            # Filter the dataframe for the desired week number
            # NOTE(review): only the week number is compared, so rows from
            # the same ISO week of every year in the data are shown.
            df = df[df['week'] == week_number]

            st.write(f"Showing data for week {week_number}")

            display_data(df)

        elif view_level == "month":
            month_names = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

            # Month picker, defaulting to the current month
            month_str = st.selectbox("Select month", options=month_names, index=datetime.today().month - 1)
            year_str = st.selectbox("Select year", options=["2024"])

            # Convert month string to number (list position is zero-based)
            month = month_names.index(month_str) + 1
            year = int(year_str)

            # Filter the dataframe for the desired month and year
            df = df[(df.index.month == month) & (df.index.year == year)]

            st.write(f"Showing data for {month_str} {year_str}")

            display_data(df)

    elif selection == "Aggregated data":
        aggregated_data(df)
        aggregated_data(df, aggregation_level="month")

    else:
        st.write("Error: selection not recognized")



# Build the dashboard only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()