Spaces:
Running
Running
Update curated.py
Browse files- curated.py +11 -10
curated.py
CHANGED
|
@@ -673,16 +673,17 @@ filtering_process = Div(
|
|
| 673 |
H4("Download and Extraction"),
|
| 674 |
P("The dataset was downloaded from:", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
|
| 675 |
P("During extraction, the logs were cleaned using the following functions:"),
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
#
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
|
|
|
| 686 |
H4("Filtering"),
|
| 687 |
Ol(
|
| 688 |
Li("Language Filter: English"),
|
|
|
|
| 673 |
H4("Download and Extraction"),
|
| 674 |
P("The dataset was downloaded from:", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
|
| 675 |
P("During extraction, the logs were cleaned using the following functions:"),
|
| 676 |
+
D_code("""
|
| 677 |
+
def exclude_system(x):
|
| 678 |
+
return '\n'.join(line for line in x.split('\n') if not line.startswith('==='))
|
| 679 |
+
|
| 680 |
+
def exclude_select_system(x):
|
| 681 |
+
return '\n'.join(line for line in x.split('\n') if not (line.startswith('===')
|
| 682 |
+
and any(term in line for term in ['has joined #', 'has left #', 'Topic for #', "Topic (#", "is now known as"]) ))
|
| 683 |
+
|
| 684 |
+
def clean(x):
|
| 685 |
+
return '\n'.join('* ' + line[4:] if line.startswith('===') else line[8:] for line in x.split('\n'))
|
| 686 |
+
""", block="block", language="python" ),
|
| 687 |
H4("Filtering"),
|
| 688 |
Ol(
|
| 689 |
Li("Language Filter: English"),
|