Commit
·
0698fac
1
Parent(s):
9a127b5
Fix the curated page layout: remove the fixed width of the chart.
Browse files- curated.py +39 -44
curated.py
CHANGED
|
@@ -579,54 +579,49 @@ data_preprocessing_div = Div(
|
|
| 579 |
),
|
| 580 |
)
|
| 581 |
|
| 582 |
-
# Data for the stacked bar chart
|
| 583 |
-
data = {
|
| 584 |
-
"Filter": [
|
| 585 |
-
"Downloaded Lines",
|
| 586 |
-
"Language Filter",
|
| 587 |
-
"Min Word Count",
|
| 588 |
-
"Unigram Log Probability",
|
| 589 |
-
],
|
| 590 |
-
"Wikipedia": [61614907, 61614907, 60468491, 60468491],
|
| 591 |
-
"Freelaw": [75971288, 73690766, 68171834, 68123174],
|
| 592 |
-
"DM Maths": [112559888, 112559888, 112559888, 112559888],
|
| 593 |
-
"USPTO": [6880276, 6878964, 6749922, 6749389],
|
| 594 |
-
"PG19": [28752, 28683, 28682, 28632],
|
| 595 |
-
"Hackernews": [2064931, 2010802, 2010488, 2003636],
|
| 596 |
-
"Ubuntu IRC": [37966, 23501, 23468, 23205],
|
| 597 |
-
"Europarl": [69814, 69814, 69814, 69814],
|
| 598 |
-
"StackExchange": [23246548, 23246548, 23246352, 23246352],
|
| 599 |
-
"Arxiv": [1911867, 1869441, 1763840, 1762661],
|
| 600 |
-
"S2ORC": [12963563, 12963563, 12963563, 12963563],
|
| 601 |
-
"S2ORC Abstract": [102324176, 83867601, 82889293, 82777912],
|
| 602 |
-
"Pubmed Central": [5230932, 4830486, 4768310, 4767474],
|
| 603 |
-
"Pubmed Abstract": [25787474, 25784374, 25747955, 25746724],
|
| 604 |
-
"Phil Papers": [49389, 39175, 39175, 39128],
|
| 605 |
-
}
|
| 606 |
|
| 607 |
-
|
| 608 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 609 |
|
| 610 |
-
|
| 611 |
-
fig = go.Figure()
|
| 612 |
|
| 613 |
-
|
| 614 |
-
for dataset in df.columns[1:]:
|
| 615 |
-
fig.add_trace(go.Bar(name=dataset, x=df["Filter"], y=df[dataset]))
|
| 616 |
|
| 617 |
-
|
| 618 |
-
fig.
|
| 619 |
-
barmode="stack",
|
| 620 |
-
title="Document Reduction by Filter for Each Dataset",
|
| 621 |
-
xaxis_title="Filter",
|
| 622 |
-
yaxis_title="Number of Lines",
|
| 623 |
-
legend_title="Dataset",
|
| 624 |
-
height=600,
|
| 625 |
-
width=1000,
|
| 626 |
-
)
|
| 627 |
|
| 628 |
-
|
| 629 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 630 |
|
| 631 |
|
| 632 |
filtering_process = Div(
|
|
@@ -635,7 +630,7 @@ filtering_process = Div(
|
|
| 635 |
P(
|
| 636 |
"Below is a detail recount of how each dataset was extracted and filtered. If specific challenges were found with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."
|
| 637 |
),
|
| 638 |
-
plotly2fasthtml(diff2_stacked_bar),
|
| 639 |
H3(
|
| 640 |
"This section continues below with the specific filtering steps taken for all 14 curated datasets."
|
| 641 |
),
|
|
|
|
| 579 |
),
|
| 580 |
)
|
| 581 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
|
| 583 |
+
def diff2_stacked_bar():
    """Build the stacked bar chart of per-dataset line counts at each filter stage.

    One bar trace is added per dataset, with the filter stage on the x axis
    and the hard-coded line count for that stage on the y axis. Only the
    figure height is fixed (600); no width is set in the layout.

    Returns:
        The ``go.Figure`` holding the stacked bar traces and layout.
    """
    # Filter stages, in the order they appear along the x axis.
    filter_stages = [
        "Downloaded Lines",
        "Language Filter",
        "Min Word Count",
        "Unigram Log Probability",
    ]

    # Line counts per dataset; each list is aligned with ``filter_stages``.
    lines_per_dataset = {
        "Wikipedia": [61614907, 61614907, 60468491, 60468491],
        "Freelaw": [75971288, 73690766, 68171834, 68123174],
        "DM Maths": [112559888, 112559888, 112559888, 112559888],
        "USPTO": [6880276, 6878964, 6749922, 6749389],
        "PG19": [28752, 28683, 28682, 28632],
        "Hackernews": [2064931, 2010802, 2010488, 2003636],
        "Ubuntu IRC": [37966, 23501, 23468, 23205],
        "Europarl": [69814, 69814, 69814, 69814],
        "StackExchange": [23246548, 23246548, 23246352, 23246352],
        "Arxiv": [1911867, 1869441, 1763840, 1762661],
        "S2ORC": [12963563, 12963563, 12963563, 12963563],
        "S2ORC Abstract": [102324176, 83867601, 82889293, 82777912],
        "Pubmed Central": [5230932, 4830486, 4768310, 4767474],
        "Pubmed Abstract": [25787474, 25784374, 25747955, 25746724],
        "Phil Papers": [49389, 39175, 39175, 39128],
    }

    # "Filter" goes first so every column after it is a dataset.
    df = pd.DataFrame({"Filter": filter_stages, **lines_per_dataset})
    stage_axis = df["Filter"]

    fig = go.Figure()
    for dataset_name in df.columns[1:]:
        bar = go.Bar(name=dataset_name, x=stage_axis, y=df[dataset_name])
        fig.add_trace(bar)

    layout_options = {
        "barmode": "stack",
        "title": "Document Reduction by Filter for Each Dataset",
        "xaxis_title": "Filter",
        "yaxis_title": "Number of Lines",
        "legend_title": "Dataset",
        "height": 600,
    }
    fig.update_layout(**layout_options)
    return fig
|
| 625 |
|
| 626 |
|
| 627 |
filtering_process = Div(
|
|
|
|
| 630 |
P(
|
| 631 |
"Below is a detail recount of how each dataset was extracted and filtered. If specific challenges were found with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."
|
| 632 |
),
|
| 633 |
+
plotly2fasthtml(diff2_stacked_bar()),
|
| 634 |
H3(
|
| 635 |
"This section continues below with the specific filtering steps taken for all 14 curated datasets."
|
| 636 |
),
|