TxT360

Running

App Files Files Community

omkarenator committed on Oct 5, 2024

Commit

0698fac

1 Parent(s): 9a127b5

fix curated page layout. remove fixed width of the chart

Browse files

Files changed (1) hide show

curated.py +39 -44

curated.py CHANGED Viewed

@@ -579,54 +579,49 @@ data_preprocessing_div = Div(
     ),
 )
-# Data for the stacked bar chart
-data = {
-    "Filter": [
-        "Downloaded Lines",
-        "Language Filter",
-        "Min Word Count",
-        "Unigram Log Probability",
-    ],
-    "Wikipedia": [61614907, 61614907, 60468491, 60468491],
-    "Freelaw": [75971288, 73690766, 68171834, 68123174],
-    "DM Maths": [112559888, 112559888, 112559888, 112559888],
-    "USPTO": [6880276, 6878964, 6749922, 6749389],
-    "PG19": [28752, 28683, 28682, 28632],
-    "Hackernews": [2064931, 2010802, 2010488, 2003636],
-    "Ubuntu IRC": [37966, 23501, 23468, 23205],
-    "Europarl": [69814, 69814, 69814, 69814],
-    "StackExchange": [23246548, 23246548, 23246352, 23246352],
-    "Arxiv": [1911867, 1869441, 1763840, 1762661],
-    "S2ORC": [12963563, 12963563, 12963563, 12963563],
-    "S2ORC Abstract": [102324176, 83867601, 82889293, 82777912],
-    "Pubmed Central": [5230932, 4830486, 4768310, 4767474],
-    "Pubmed Abstract": [25787474, 25784374, 25747955, 25746724],
-    "Phil Papers": [49389, 39175, 39175, 39128],
-}
-# Creating a dataframe
-df = pd.DataFrame(data)
-# Creating the stacked bar chart
-fig = go.Figure()
-# Add trace for each dataset
-for dataset in df.columns[1:]:
-    fig.add_trace(go.Bar(name=dataset, x=df["Filter"], y=df[dataset]))
-# Update the layout
-fig.update_layout(
-    barmode="stack",
-    title="Document Reduction by Filter for Each Dataset",
-    xaxis_title="Filter",
-    yaxis_title="Number of Lines",
-    legend_title="Dataset",
-    height=600,
-    width=1000,
-)
-# Show the plot
-diff2_stacked_bar = fig
 filtering_process = Div(
@@ -635,7 +630,7 @@ filtering_process = Div(
         P(
             "Below is a detail recount of how each dataset was extracted and filtered. If specific challenges were found with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."
         ),
-        plotly2fasthtml(diff2_stacked_bar),
         H3(
             "This section continues below with the specific filtering steps taken for all 14 curated datasets."
         ),

     ),
 )
+def diff2_stacked_bar():
+    # Data for the stacked bar chart
+    data = {
+        "Filter": [
+            "Downloaded Lines",
+            "Language Filter",
+            "Min Word Count",
+            "Unigram Log Probability",
+        ],
+        "Wikipedia": [61614907, 61614907, 60468491, 60468491],
+        "Freelaw": [75971288, 73690766, 68171834, 68123174],
+        "DM Maths": [112559888, 112559888, 112559888, 112559888],
+        "USPTO": [6880276, 6878964, 6749922, 6749389],
+        "PG19": [28752, 28683, 28682, 28632],
+        "Hackernews": [2064931, 2010802, 2010488, 2003636],
+        "Ubuntu IRC": [37966, 23501, 23468, 23205],
+        "Europarl": [69814, 69814, 69814, 69814],
+        "StackExchange": [23246548, 23246548, 23246352, 23246352],
+        "Arxiv": [1911867, 1869441, 1763840, 1762661],
+        "S2ORC": [12963563, 12963563, 12963563, 12963563],
+        "S2ORC Abstract": [102324176, 83867601, 82889293, 82777912],
+        "Pubmed Central": [5230932, 4830486, 4768310, 4767474],
+        "Pubmed Abstract": [25787474, 25784374, 25747955, 25746724],
+        "Phil Papers": [49389, 39175, 39175, 39128],
+    }
+    df = pd.DataFrame(data)
+    fig = go.Figure()
+    for dataset in df.columns[1:]:
+        fig.add_trace(go.Bar(name=dataset, x=df["Filter"], y=df[dataset]))
+    fig.update_layout(
+        barmode="stack",
+        title="Document Reduction by Filter for Each Dataset",
+        xaxis_title="Filter",
+        yaxis_title="Number of Lines",
+        legend_title="Dataset",
+        height=600,
+    )
+    return fig
 filtering_process = Div(
         P(
             "Below is a detail recount of how each dataset was extracted and filtered. If specific challenges were found with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."
         ),
+        plotly2fasthtml(diff2_stacked_bar()),
         H3(
             "This section continues below with the specific filtering steps taken for all 14 curated datasets."
         ),