Commit 2c2527f · Parent(s): 5d485e5
everything in expanders
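
This commit wraps each section of the Streamlit report (document filtering, word filtering, data download, personal-document analysis) in an st.expander block, so every section is collapsed by default instead of rendering as one long page. A minimal standalone sketch of the pattern, assuming only streamlit and pandas and made-up data (not the app's own):

import pandas as pd
import streamlit as st

# Each section gets its own expander: header and content render inside it,
# and the page stays compact until the reader opens the section.
docs = pd.DataFrame({"perplexity_score": [12.3, 45.6, 880.1]})
with st.expander("Filtering on documents"):
    st.header("Filtering on documents")
    st.dataframe(docs)  # hidden until the expander is opened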
app.py CHANGED
@@ -111,19 +111,24 @@ class Visualization:
             self.docs = self.docs_checkpoint
 
     def set_title(self):
-        st.title(f"…
+        st.title(f"Filtering visualization")
 
     @staticmethod
     def plot_hist(dataframe, key, num_bins=50):
-        checkbox = st.checkbox(…
+        checkbox = st.checkbox(
+            "Diplay distribution", value=True, key=f"display_distribution_{key[0]}"
+        )
         if checkbox:
             fig, ax = plt.subplots()
             val = dataframe[key[0]].values
             if np.median(val) != 0:
-                val = val[…
+                val = val[
+                    abs(val - np.median(val))
+                    < 9 * np.median(np.absolute(val - np.median(val)))
+                ]
             ax.hist(val, bins=num_bins, density=True)
             ax.set_title(" ".join(key[0].split("_")))
-            ax.axvline(x=key[1], color=…
+            ax.axvline(x=key[1], color="r", linestyle="dashed")
             st.pyplot(fig)
 
     def filtering_of_docs(self):
@@ -273,9 +278,7 @@ class Visualization:
                 with st.sidebar.expander("Perplexity score"):
                     cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
                     max_pp = int(np.max(self.docs["perplexity_score"])) + 1
-                    cutoff_perplexity_score = st.slider(
-                        cutoff_def, 0, max_pp, max_pp
-                    )
+                    cutoff_perplexity_score = st.slider(cutoff_def, 0, max_pp, max_pp)
                     new_key = ("perplexity_score", cutoff_perplexity_score, True)
                     keys.append(new_key)
                     Visualization.plot_hist(self.docs, new_key)
@@ -291,80 +294,96 @@ class Visualization:
         all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
         all_conds = np.all(all_conds, axis=0)
 
-        st.…
-…
-                f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
+        with st.expander(
+            f"Filtering on documents, for {self.num_docs} {self.lang} documents"
+        ):
+            st.header(
+                f"Filtering on documents, for {self.num_docs} {self.lang} documents"
             )
-…
+
+            def display_dataset(cond, description):
+                displayed_docs = self.docs.loc[cond]
+                st.subheader(
+                    f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
+                )
+                st.markdown(
+                    "Click on a column to sort by it, place the cursor on the text to display it."
+                )
+                st.dataframe(displayed_docs)
+
+            display_dataset(np.invert(all_conds), "Discarded documents")
+
+            # st.subheader("Display discarded documents by filter")
+            display_discarded_documents_by_filter = st.checkbox(
+                "Display discarded documents by filter"
             )
-            st.dataframe(displayed_docs)
 
-…
+            if display_discarded_documents_by_filter:
+                columns = list(self.docs)
 
-…
+                if "number_words" in columns:
+                    cond_filter = np.invert(np.all(conds["number_words"], axis=0))
+                    display_dataset(
+                        cond_filter,
+                        "Discarded documents for the filter on the number of words",
+                    )
 
-…
+                if "repetitions_ratio" in columns:
+                    cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
+                    display_dataset(
+                        cond_filter,
+                        "Discarded documents for the filter on the repetitions ratio",
+                    )
 
-…
+                if "special_characters_ratio" in columns:
+                    cond_filter = np.invert(
+                        np.all(conds["special_characters_ratio"], axis=0)
+                    )
+                    display_dataset(
+                        cond_filter,
+                        "Discarded documents for the filter on the special characters ratio",
+                    )
 
-…
+                if "stopwords_ratio" in columns:
+                    cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
+                    display_dataset(
+                        cond_filter,
+                        "Discarded documents for the filter on the stop words ratio",
+                    )
 
-…
+                if "flagged_words_ratio" in columns:
+                    cond_filter = np.invert(
+                        np.all(conds["flagged_words_ratio"], axis=0)
+                    )
+                    display_dataset(
+                        cond_filter,
+                        "Discarded documents for the filter on the flagged words ratio",
+                    )
 
-…
+                if "lang_id_score" in columns:
+                    cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
+                    display_dataset(
+                        cond_filter,
+                        "Discarded documents for the filter on the language identification confidence score",
+                    )
 
-…
+                if "perplexity_score" in columns:
+                    cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
+                    display_dataset(
+                        cond_filter,
+                        "Discarded documents for the filter on the perplexity score",
+                    )
 
-…
-                cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                display_dataset(
-                    cond_filter,
-                    "Discarded documents for the filter on the language identification confidence score",
-                )
+            display_dataset(all_conds, "Retained documents")
 
-…
-                cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                display_dataset(
-                    cond_filter,
-                    "Discarded documents for the filter on the perplexity score",
-                )
+            st.header("Download data")
 
-…
+            with open(self.path_data) as json_file:
+                btn = st.download_button(
+                    label="Download data as json",
+                    data=json_file,
+                    file_name="data.json",
+                )
 
     def filtering_of_words(self):
         if not (self.words is None):
@@ -386,32 +405,39 @@ class Visualization:
 
             cond_words = self.words["len_word"] <= cutoff_word
             if incorrect_substrings:
-                cond_words = cond_words & np.invert(…
+                cond_words = cond_words & np.invert(
+                    self.words["incorrect_substring"]
+                )
 
-            st.…
+            with st.expander(
+                f"Filtering on words, for {self.num_docs} {self.lang} documents"
+            ):
+                st.header(
+                    f"Filtering on words, for {self.num_docs} {self.lang} documents"
+                )
 
-…
+                st.markdown(
+                    f"Since the number of words is way larger than the number of documents, "
+                    f"we consider in this section words for the first {self.num_docs_for_words} documents only."
+                )
 
-…
+                discarded_words = self.words.loc[np.invert(cond_words)]
+                st.subheader(
+                    f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
+                )
+                st.markdown(
+                    "Click on a column to sort by it, place the cursor on the text to display it."
+                )
+                st.dataframe(discarded_words)
 
-…
+                retained_words = self.words.loc[cond_words]
+                st.subheader(
+                    f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
+                )
+                st.markdown(
+                    "Click on a column to sort by it, place the cursor on the text to display it."
+                )
+                st.dataframe(retained_words)
 
     def download_parameters(self):
         st.sidebar.subheader("Download parameters")
@@ -421,6 +447,7 @@ class Visualization:
             file_name=f"parameters_{self.lang_dataset_id}.json",
         )
 
+    """
     def plot_zipf_law(self):
         if not (self.words is None):
             st.header("Zipf's Law")
@@ -441,144 +468,136 @@ class Visualization:
             ax.set_xlabel("$i$-th most frequent word")
             ax.set_ylabel("frequency in the documents")
             st.pyplot(fig)
+    """
 
     def analyse_personal_doc(self):
-        st.…
+        with st.expander("Analyse your own document"):
+            st.header("Analyse your own document")
 
-…
+            personal_doc = st.text_area(
+                label="Paste here the document you want to analyse",
+                value="",
+                max_chars=10000,
+            )
 
-…
+            is_discarded = False
 
-…
+            def is_doc_discarded(key, score):
+                if key[2]:  # max cutoff
+                    return score > key[1]
+                else:
+                    return score < key[1]
 
-…
+            if personal_doc:
 
-…
+                st.markdown("Statistics of the document:")
 
-…
-                    )
-                    if key[2]:
-                        st.markdown(f"Number of words: {len(words)}")
-                    if is_doc_discarded(key, len(words)):
-                        is_discarded = True
-
-                elif key[0] == "repetitions_ratio":
-                    repetitions_ratio = Filtering.compute_repetitions_ratio(
-                        personal_doc, int(key[3])
-                    )
-                    repetitions_ratio = round(repetitions_ratio, 3)
-                    st.markdown(f"Repetitions ratio: {repetitions_ratio}")
-                    if is_doc_discarded(key, repetitions_ratio):
-                        is_discarded = True
-
-                elif key[0] == "special_characters_ratio":
-                    special_characters_ratio = (
-                        Filtering.compute_special_characters_ratio(
-                            personal_doc, self.param["special_characters"]
+                for key in self.keys:
+                    if key[0] == "number_words":
+                        words = ModifyingDocuments.get_words_from_document(
+                            personal_doc,
+                            self.sentencepiece_model_tok,
+                            lower_case=False,
+                            strip_characters=self.param["strip_characters"],
                         )
-…
+                        if key[2]:
+                            st.markdown(f"Number of words: {len(words)}")
+                        if is_doc_discarded(key, len(words)):
+                            is_discarded = True
+
+                    elif key[0] == "repetitions_ratio":
+                        repetitions_ratio = Filtering.compute_repetitions_ratio(
+                            personal_doc, int(key[3])
+                        )
+                        repetitions_ratio = round(repetitions_ratio, 3)
+                        st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                        if is_doc_discarded(key, repetitions_ratio):
+                            is_discarded = True
+
+                    elif key[0] == "special_characters_ratio":
+                        special_characters_ratio = (
+                            Filtering.compute_special_characters_ratio(
+                                personal_doc, self.param["special_characters"]
+                            )
+                        )
+                        special_characters_ratio = round(special_characters_ratio, 3)
+                        st.markdown(
+                            f"Special characters ratio: {special_characters_ratio}"
+                        )
+                        if is_doc_discarded(key, special_characters_ratio):
+                            is_discarded = True
+
+                    elif key[0] == "stopwords_ratio":
+                        stopwords_ratio = Filtering.compute_stopwords_ratio(
+                            personal_doc,
+                            self.sentencepiece_model_tok,
+                            self.param["strip_characters"],
+                            self.param["cond_words_augmentation"],
+                            self.param["words_augmentation_group_sizes"],
+                            self.param["words_augmentation_join_char"],
+                            self.stopwords,
+                        )
+                        stopwords_ratio = round(stopwords_ratio, 3)
+                        st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                        if is_doc_discarded(key, stopwords_ratio):
+                            is_discarded = True
+
+                    elif key[0] == "flagged_words_ratio":
+                        flagged_words_ratio = Filtering.compute_flagged_words_ratio(
+                            personal_doc,
+                            self.sentencepiece_model_tok,
+                            self.param["strip_characters"],
+                            self.param["cond_words_augmentation"],
+                            self.param["words_augmentation_group_sizes"],
+                            self.param["words_augmentation_join_char"],
+                            self.flagged_words,
+                        )
+                        flagged_words_ratio = round(flagged_words_ratio, 3)
+                        st.markdown(f"Flagged words ratio: {flagged_words_ratio}")
+                        if is_doc_discarded(key, flagged_words_ratio):
+                            is_discarded = True
+
+                    elif key[0] == "lang_id_score":
+                        (
+                            lang_pred_dataset_id,
+                            lang_id_score,
+                        ) = Filtering.compute_lang_id_pred_score(
+                            personal_doc, self.model_lang_id
+                        )
+                        lang_id_score = round(lang_id_score, 3)
+                        st.markdown(
+                            f"Language identification confidence score: {lang_id_score}"
+                        )
+                        if is_doc_discarded(key, flagged_words_ratio) or (
+                            self.lang_dataset_id != lang_pred_dataset_id
+                        ):
+                            is_discarded = True
+
+                    elif key[0] == "perplexity_score":
+                        perplexity_score = Filtering.compute_perplexity_score(
+                            personal_doc,
+                            self.sentencepiece_model,
+                            self.kenlm_model,
+                        )
+                        perplexity_score = round(perplexity_score, 3)
+                        st.markdown(f"Perplexity score: {perplexity_score}")
+                        if is_doc_discarded(key, perplexity_score):
+                            is_discarded = True
+
+                is_discarded = "" if is_discarded else "not "
+                st.markdown(
+                    f"With the current filtering parameters, this document **is {is_discarded}discarded**."
+                )
 
     def visualization(self):
-        self.warning_preamble()
+        # self.warning_preamble()
         self.preamble()
         self.open_data()
         self.set_title()
         self.filtering_of_docs()
         self.filtering_of_words()
         self.download_parameters()
-        # self.plot_zipf_law()
         self.analyse_personal_doc()
-        self.download_data()
 
 
 path_instructions = "./explanation_filtering_pipeline.pdf"
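
One detail worth calling out in plot_hist above: before drawing the histogram, the new code trims values lying more than 9 median absolute deviations (MAD) away from the median, so a few extreme documents no longer stretch the x-axis. A standalone sketch of that trimming with toy numbers (not the app's data):

import numpy as np

val = np.array([1.0, 2.0, 2.5, 3.0, 1000.0])  # toy scores with one extreme outlier
med = np.median(val)                          # 2.5
mad = np.median(np.absolute(val - med))       # 0.5
trimmed = val[np.absolute(val - med) < 9 * mad]
print(trimmed)  # [1.  2.  2.5 3. ]: the 1000.0 outlier is dropped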
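The analyse_personal_doc section relies on the tuple convention used for filter keys throughout the app: key[0] is the metric name, key[1] the cutoff chosen in the sidebar, and key[2] whether the cutoff is a maximum (discard above) or a minimum (discard below); some keys carry extra parameters, such as the int passed to compute_repetitions_ratio via key[3]. A runnable sketch of the is_doc_discarded helper added by the commit, exercised with a made-up key:

def is_doc_discarded(key, score):
    if key[2]:  # max cutoff: discard documents scoring above it
        return score > key[1]
    else:  # min cutoff: discard documents scoring below it
        return score < key[1]

print(is_doc_discarded(("perplexity_score", 1000, True), 1500.0))  # True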

